diff --git a/radish/README.md b/radish/README.md
new file mode 100644
index 0000000..c2f7685
--- /dev/null
+++ b/radish/README.md
@@ -0,0 +1,2 @@
+* input data in /shared/patents/bigrams-nfs/ngrams
+* raco must output JSON+splits file input
diff --git a/radish/catalog.py b/radish/catalog.py
new file mode 100644
index 0000000..75cf8c3
--- /dev/null
+++ b/radish/catalog.py
@@ -0,0 +1,6 @@
+# Schemas corresponding to Myrial examples
+# (tfidf is the product of a float tf and a log-based idf, hence DOUBLE_TYPE)
+{
+    'public:adhoc:ngrams': [('word', 'STRING_TYPE'), ('filename', 'STRING_TYPE')],
+    'public:adhoc:tfidf': [('term', 'STRING_TYPE'), ('document', 'STRING_TYPE'), ('tfidf', 'DOUBLE_TYPE')],
+}
diff --git a/radish/tfidf.myl b/radish/tfidf.myl
new file mode 100644
index 0000000..6ba47c0
--- /dev/null
+++ b/radish/tfidf.myl
@@ -0,0 +1,48 @@
+-- TF-IDF over bigrams: tfidf(t, d) = tf(t, d) * idf(t).
+-- Reads the ngrams relation and stores one score per (term, document)
+-- pair into the tfidf relation.
+
+-- assumed schema for bigrams (see catalog.py):
+-- word, filename (the bigram is the term, the file is the document)
+
+bigrams = scan(ngrams);
+
+-- Frequency(t, d)
+freq = select bigrams.word as term,
+              bigrams.filename as document,
+              count(bigrams.word) as freq
+       from bigrams; -- groups by term, filename
+
+-- MaxFrequency(d) = max_{w \in d}(Frequency(w,d))
+maxfreq = select freq.document as document,
+                 max(freq.freq) as maxfreq
+          from freq; -- groups by document
+
+-- term frequency, scaled into (0.5, 1] by the document's most frequent term
+tf = select freq.term as term,
+            freq.document as document,
+            0.5 + 0.5*freq.freq/maxfreq.maxfreq as tf -- max over words in doc
+     from freq, maxfreq
+     where freq.document = maxfreq.document;
+
+-- num documents
+-- hardcoded N=300
+
+-- documents containing each term; counted over freq (one row per
+-- term/document pair) so repeats of a term inside one file count once
+invfreq = select freq.term as term,
+                 COUNT(freq.document) as numdocs
+          from freq; -- groups by term
+
+idf = select invfreq.term as term,
+             log( FLOAT(300)/ invfreq.numdocs) as idf
+      from invfreq;
+
+
+tfidf = select tf.term as term,
+               tf.document as document,
+               tf.tf * idf.idf as tfidf
+        from tf, idf
+        where tf.term = idf.term;
+
+store(tfidf, tfidf);