Added stopword list, sbatch script & input mapper
Jevin's stop words are now in stopwords.txt

claims_mapper.py contains the mapper we need to consume the initial
claims splits

slurm_hadoop_tfidf.sb is an sbatch script that should request the hdfs
name node / yarn manager and run mapred_tfidf.py
zbsimon committed Apr 19, 2015
1 parent c532367 commit 0b0c6f5
Showing 3 changed files with 711 additions and 0 deletions.
20 changes: 20 additions & 0 deletions claims_mapper.py
@@ -0,0 +1,20 @@
#!/usr/bin/env python

import sys
import map_reduce_utils as mru

# this should become an arg to map_claims
INPUT_KV_DELIMITER = '~~'

def map_claims(input=sys.stdin, output=sys.stdout, kv_delim=INPUT_KV_DELIMITER):
    for line in input:
        key, value = line.strip().split(kv_delim)
        filename = key.strip()
        contents = mru.clean_text(value)
        key = {'filename': filename}
        contents = {'words': [word for word in contents]}
        # emit one record per claim: the filename key and its cleaned tokens
        mru.reducer_emit(key, contents, output)


if __name__ == '__main__':
    map_claims()
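
For reference, each input record to claims_mapper.py is a single line with the filename and the raw claim text separated by the '~~' delimiter. A quick local smoke test might look like the sketch below; the patent id and claim text are made-up placeholders, and map_reduce_utils.py is assumed to be importable from the current directory:

# hypothetical local test of the mapper, outside of Hadoop Streaming
chmod +x claims_mapper.py
printf 'US1234567~~A method comprising a widget coupled to a gadget.\n' | ./claims_mapper.py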
17 changes: 17 additions & 0 deletions slurm_hadoop_tfidf.sb
@@ -0,0 +1,17 @@
#!/usr/bin/env bash

# run this slurm script by executing:
# sbatch /shared/patents/nltk-hadoop/slurm_hadoop_tfidf.sb
# (or whatever the absolute path is)

# search for this job name in `squeue` to find the job
#SBATCH --job-name=mapreduce_tfidf_patent_claims

# request the hdfs name node and yarn manager
#SBATCH --nodelist=n05

HDFS_CLAIMS_PATH=hdfs:/shared/patents/claims_splits
OUTPUT=hdfs:/shared/pattents

# still need to pass the custom stop words list here as well
srun ./mapred_tfidf.py -f -i $HDFS_CLAIMS_PATH -o $OUTPUT
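
As the comments above note, the script is meant to be submitted from the shared checkout. A sketch of submitting and locating the job, assuming a standard Slurm install and the path given in the header comment:

# submit the job (the absolute path may differ on another cluster)
sbatch /shared/patents/nltk-hadoop/slurm_hadoop_tfidf.sb

# the job can then be found by the name set with --job-name
squeue --name=mapreduce_tfidf_patent_claims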
674 changes: 674 additions & 0 deletions stopwords.txt
