Skip to content

Commit

Permalink
chapters 1 and 2 written.
Browse files Browse the repository at this point in the history
  • Loading branch information
lakshmanok committed Mar 17, 2017
1 parent a024716 commit 374450a
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 8 deletions.
1 change: 1 addition & 0 deletions courses/unstructured/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
tmp
4 changes: 2 additions & 2 deletions courses/unstructured/ML-Tests-Solution.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@
"\n",
"# Running Vision API\n",
"import base64\n",
"IMAGE=\"gs://cpb103-public-files/noirbree.jpg\"\n",
"IMAGE=\"gs://BUCKET_NAME/unstructured/noirbree.jpg\"\n",
"vservice = build('vision', 'v1', developerKey=APIKEY)\n",
"request = vservice.images().annotate(body={\n",
" 'requests': [{\n",
Expand Down Expand Up @@ -160,7 +160,7 @@
}
],
"source": [
"alice = sc.textFile(\"gs://cpb103-public-files/alice-short-transformed.txt\")\n",
"alice = sc.textFile(\"gs://BUCKET_NAME/unstructured/alice-short-transformed.txt\")\n",
"alice = alice.map(lambda x: x.split(\".\"))\n",
" \n",
"for eachSentence in alice.take(10):\n",
Expand Down
4 changes: 2 additions & 2 deletions courses/unstructured/PySpark-Test-Solution.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
"from pyspark.sql.types import *\n",
"header = 'animal,name'\n",
"schema = StructType([StructField(colname, StringType(), True) for colname in header.split(',')])\n",
"pets = spark.read.schema(schema).csv('gs://cpb103-public-files/pets.txt')\n",
"pets = spark.read.schema(schema).csv('gs://BUCKET_NAME/unstructured/pets.txt')\n",
"\n",
"pets.createOrReplaceTempView('pets')\n",
"countsByPet = spark.sql('SELECT animal, COUNT(*) from pets GROUP BY animal')\n",
Expand All @@ -85,7 +85,7 @@
}
],
"source": [
"file = sc.textFile(\"gs://cpb103-public-files/pets.txt\")\n",
"file = sc.textFile(\"gs://BUCKET_NAME/unstructured/pets.txt\")\n",
"\n",
"pets = file.map(lambda s: s.split(\",\")).map(lambda x : (x[0], [x[1]]))\n",
"petsByType = pets.reduceByKey(lambda a, b: a + b)\n",
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
# Lab exercise: basic RDD operations with a local SparkContext.
# NOTE(review): Python 2 syntax throughout (`print` statements, tuple-parameter
# lambda on the last line) — this script will not parse under Python 3.
from pyspark import SparkContext
sc = SparkContext("local")

# Sanity check: distribute 0..999 across 10 partitions and average them.
rdd = sc.parallelize(range(1000), 10)
print rdd.mean()

# NOTE(review): the next two assignments are the old/new sides of a diff
# (public bucket path replaced by the BUCKET_NAME placeholder that
# replace_and_upload.sh substitutes). textFile() is lazy, so only the
# second assignment is ever evaluated.
file = sc.textFile("gs://cpb103-public-files/lab2a-input.txt")
file = sc.textFile("gs://BUCKET_NAME/unstructured/lab2-input.txt")
# Parse "key,value" lines into (key, [value]) pairs so reduceByKey can
# concatenate the value lists.
dataLines = file.map(lambda s: s.split(",")).map(lambda x : (x[0], [x[1]]))
print dataLines.take(100)

# Group all values under their key: (key, [v1, v2, ...]).
databyKey = dataLines.reduceByKey(lambda a, b: a + b)
print databyKey.take(100)

# Count how many values each key has (Python 2 tuple unpacking in lambda).
countByKey = databyKey.map(lambda (k,v): (k, len(v)))
print countByKey.take(100)
22 changes: 22 additions & 0 deletions courses/unstructured/replace_and_upload.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/sh
# Substitute the BUCKET_NAME placeholder in the course lab files and upload
# both the originals and the substituted copies to gs://<bucket>/unstructured.
# Usage: ./replace_and_upload.sh bucket-name

if [ "$#" -ne 1 ]; then
  echo "Usage: ./replace_and_upload.sh bucket-name"
  exit 1   # nonzero status so callers can detect the usage error
fi

BUCKET=$1
echo "replacing bucket references to $BUCKET and copying to gs://$BUCKET/unstructured"

# Build the substituted copies in a scratch directory, leaving originals intact.
TEMP=tmp
rm -rf "$TEMP"
mkdir "$TEMP"
# Glob directly instead of parsing `ls` output — safe with spaces/odd names.
for FILE in *.py *.ipynb; do
  [ -e "$FILE" ] || continue   # skip the literal pattern when nothing matches
  echo "$FILE"
  sed "s/BUCKET_NAME/$BUCKET/g" "$FILE" > "$TEMP/$FILE"
done

# first the originals, then the modified (same names, so the modified win)
gsutil -m cp * "gs://$BUCKET/unstructured"
gsutil -m cp "$TEMP"/* "gs://$BUCKET/unstructured"

0 comments on commit 374450a

Please sign in to comment.