Skip to content

Commit

Permalink
Merge pull request #8 from uwsampa/bmyerz/radish
Browse files Browse the repository at this point in the history
Radish/myriaL query for bigrams->tfidf
  • Loading branch information
bmyerz committed Jun 23, 2015
2 parents ecd77af + 3bd95b3 commit 67dfc71
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 0 deletions.
2 changes: 2 additions & 0 deletions radish/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
* input data in /shared/patents/bigrams-nfs/ngrams
* raco must output JSON+splits file input
6 changes: 6 additions & 0 deletions radish/catalog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Schemas corresponding to Myrial examples

{
# Input relation scanned by tfidf.myl: one row per bigram occurrence.
'public:adhoc:ngrams' : [('word', 'STRING_TYPE'), ('filename','STRING_TYPE')],
# Output relation stored by tfidf.myl. The score is tf * idf, where tf is
# fractional and idf is a logarithm, so it must be a floating-point column
# rather than an integer one.
'public:adhoc:tfidf' : [('term', 'STRING_TYPE'), ('document', 'STRING_TYPE'), ('tfidf', 'DOUBLE_TYPE')],
}
48 changes: 48 additions & 0 deletions radish/tfidf.myl
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@




-- Input relation: ngrams, with columns (word, filename) as declared in catalog.py.
-- Each word is a bigram and serves as the term; filename identifies the document.

bigrams = SCAN(ngrams);

-- Frequency(t, d): how many times term t occurs in document d.
-- The two non-aggregated output columns (term, document) are the implicit
-- grouping keys.
freq = SELECT bigrams.word AS term,
              bigrams.filename AS document,
              COUNT(bigrams.word) AS freq
       FROM bigrams;

-- MaxFrequency(d) = max_{w \in d}(Frequency(w, d)):
-- the highest count reached by any single term within document d.
maxfreq = SELECT freq.document AS document,
                 MAX(freq.freq) AS maxfreq
          FROM freq; -- implicitly grouped by document

-- Term frequency, normalized by the most frequent term of the same document:
--   tf(t, d) = 0.5 + 0.5 * Frequency(t, d) / MaxFrequency(d)
-- The join on document pairs every (term, document) count with that
-- document's maximum count.
tf = SELECT freq.term AS term,
            freq.document AS document,
            0.5 + 0.5*freq.freq/maxfreq.maxfreq AS tf
     FROM freq, maxfreq
     WHERE freq.document = maxfreq.document;

-- Total number of documents in the corpus (N):
-- hardcoded as N=300 in the idf computation below.

-- Document frequency: the number of distinct documents that contain each term.
-- freq holds exactly one row per (term, document) pair, so counting its rows
-- per term counts distinct documents. Counting rows of bigrams instead would
-- count every occurrence of the term, inflating numdocs and potentially
-- driving idf negative once numdocs exceeds N.
invfreq = select freq.term as term,
                 COUNT(freq.document) as numdocs
          from freq; -- groups by term

-- Inverse document frequency: idf(t) = log(N / numdocs(t)),
-- with the corpus size N fixed at 300.
-- FLOAT(...) on the numerator keeps the ratio fractional.

idf = SELECT invfreq.term AS term,
             LOG(FLOAT(300) / invfreq.numdocs) AS idf
      FROM invfreq;


-- Final score: tfidf(t, d) = tf(t, d) * idf(t), pairing rows on term.
tfidf = SELECT tf.term AS term,
               tf.document AS document,
               tf.tf * idf.idf AS tfidf
        FROM tf, idf
        WHERE tf.term = idf.term;

-- Persist the result under the relation name tfidf.
STORE(tfidf, tfidf);

0 comments on commit 67dfc71

Please sign in to comment.