Skip to content

Commit

Permalink
fixes to the query and catalog
Browse files Browse the repository at this point in the history
  • Loading branch information
bmyerz committed Jun 23, 2015
1 parent 2924cc8 commit 3bd95b3
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 7 deletions.
2 changes: 1 addition & 1 deletion radish/catalog.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Schemas corresponding to Myrial examples.
# Each key is a colon-separated relation name; each value is that relation's
# schema, given as a list of (column name, type name) pairs.

{
'public:adhoc:bigrams' : [('term', 'STRING_TYPE'), ('filename','STRING_TYPE')],
# 'ngrams' is the relation scanned by tfidf.myl, which reads its `word` and
# `filename` columns.
'public:adhoc:ngrams' : [('word', 'STRING_TYPE'), ('filename','STRING_TYPE')],
# NOTE(review): tfidf is declared LONG_TYPE, but the idf factor visible in
# tfidf.myl is log(FLOAT(300) / numdocs), a floating-point value -- confirm
# whether this column should use a floating-point type instead.
'public:adhoc:tfidf' : [('term', 'STRING_TYPE'), ('document', 'STRING_TYPE'), ('tfidf', 'LONG_TYPE')],
}
14 changes: 8 additions & 6 deletions radish/tfidf.myl
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
-- Assumes the scanned relation has two columns: filename and term
-- (each bigram is treated as a term).

-- Load the input relation and bind it to the name `bigrams`.
-- NOTE(review): two consecutive assignments to `bigrams` appear here
-- (scan(bigrams), then scan(ngrams)). This looks like the removed and added
-- lines of a diff shown without +/- markers -- confirm which one is live.
bigrams = scan(bigrams);
bigrams = scan(ngrams);

-- freqs
-- NOTE(review): the `freq = select ...` header and the count(...) line each
-- appear twice below (once reading .term, once reading .word). This looks like
-- the removed and added lines of a diff shown without +/- markers -- confirm
-- which variant is live.
freq = select bigrams.term as term,
-- Frequency(t, d): the count of input rows for term t in document d
-- (document = filename); the result has columns term, document, freq.
freq = select bigrams.word as term,
bigrams.filename as document,
count(bigrams.term) as freq
count(bigrams.word) as freq
from bigrams; -- groups by term, filename

-- max freqs
-- MaxFrequency(d) = max_{w \in d}(Frequency(w,d))
-- i.e. for each document, the largest freq value among that document's rows
-- in `freq`; the result has columns document, maxfreq.
maxfreq = select freq.document as document,
max(freq.freq) as maxfreq
from freq; -- groups by document
Expand All @@ -28,10 +28,12 @@ tf = select freq.term as term,
-- num documents
-- hardcoded N=300 (the constant 300 is used as the numerator in the idf step)

-- invfreq: for each term, numdocs = COUNT over the filename column of the
-- input relation.
-- NOTE(review): COUNT(bigrams.filename) counts rows, not distinct filenames.
-- If the input holds one row per term occurrence (as the freq step's count
-- suggests), numdocs is the total number of occurrences of the term rather
-- than the number of documents containing it -- confirm the intent.
-- NOTE(review): the `invfreq = select ...` header appears twice (.term and
-- .word); this looks like the removed and added lines of a diff -- confirm
-- which one is live.
invfreq = select bigrams.term as term,
invfreq = select bigrams.word as term,
COUNT(bigrams.filename) as numdocs
from bigrams; -- groups by term

-- how many?

-- idf(term) = log(N / numdocs), with N hardcoded to 300 and numdocs taken
-- from invfreq; the result has columns term, idf.
-- NOTE(review): FLOAT(300) presumably forces floating-point division so the
-- quotient is not truncated -- confirm against the engine's division rules.
idf = select invfreq.term as term,
log( FLOAT(300)/ invfreq.numdocs) as idf
from invfreq;
Expand Down

0 comments on commit 3bd95b3

Please sign in to comment.