From 9d5750e34403cfb5d04d54072bb2280c0a389c11 Mon Sep 17 00:00:00 2001
From: Bernhard Merkle
Date: Sat, 4 Jan 2025 23:23:38 +0100
Subject: [PATCH 1/3] fix TfidfVectorizer(stop_words=list(stopwords))

---
 ch05/Feature_Engineering_Similarity.ipynb | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/ch05/Feature_Engineering_Similarity.ipynb b/ch05/Feature_Engineering_Similarity.ipynb
index 6a04ae9..fef290a 100644
--- a/ch05/Feature_Engineering_Similarity.ipynb
+++ b/ch05/Feature_Engineering_Similarity.ipynb
@@ -764,9 +764,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "%%time\n",
@@ -880,7 +878,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "blueprints",
    "language": "python",
    "name": "python3"
   },
@@ -894,7 +892,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.8"
   },
   "toc": {
    "base_numbering": 1,
From 1d660b989416b0436ee4cb7a1eb195ed8cf83ef5 Mon Sep 17 00:00:00 2001
From: Bernhard Merkle
Date: Sat, 4 Jan 2025 23:24:38 +0100
Subject: [PATCH 2/3] Update TfidfVectorizer to use list of stopwords

---
 ch05/Feature_Engineering_Similarity.ipynb | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/ch05/Feature_Engineering_Similarity.ipynb b/ch05/Feature_Engineering_Similarity.ipynb
index fef290a..9343174 100644
--- a/ch05/Feature_Engineering_Similarity.ipynb
+++ b/ch05/Feature_Engineering_Similarity.ipynb
@@ -451,7 +451,7 @@
    "source": [
     "from spacy.lang.en.stop_words import STOP_WORDS as stopwords\n",
     "print(len(stopwords))\n",
-    "tfidf = TfidfVectorizer(stop_words=stopwords)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords))\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])\n",
     "dt"
    ]
@@ -469,7 +469,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords, min_df=2)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), min_df=2)\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])\n",
     "dt"
    ]
@@ -480,7 +480,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords, min_df=.0001)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), min_df=.0001)\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])\n",
     "dt"
    ]
@@ -498,7 +498,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords, max_df=0.1)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), max_df=0.1)\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])\n",
     "dt"
    ]
@@ -527,11 +527,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,2), min_df=2)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), ngram_range=(1,2), min_df=2)\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])\n",
     "print(dt.shape)\n",
     "print(dt.data.nbytes)\n",
-    "tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,3), min_df=2)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), ngram_range=(1,3), min_df=2)\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])\n",
     "print(dt.shape)\n",
     "print(dt.data.nbytes)"
@@ -575,7 +575,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords))\n",
     "dt = tfidf.fit_transform(headlines[\"lemmas\"].map(str))\n",
     "dt"
    ]
@@ -586,7 +586,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords))\n",
     "dt = tfidf.fit_transform(headlines[\"nav\"].map(str))\n",
     "dt"
    ]
@@ -634,7 +634,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords, min_df=2)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), min_df=2)\n",
     "dt = tfidf.fit_transform(headlines[\"lemmas\"].map(str))\n",
     "dt"
    ]
@@ -690,7 +690,7 @@
    "source": [
     "# there are \"test\" headlines in the corpus\n",
     "stopwords.add(\"test\")\n",
-    "tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,2), min_df=2, norm='l2')\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), ngram_range=(1,2), min_df=2, norm='l2')\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])"
    ]
   },
@@ -839,7 +839,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf_word = TfidfVectorizer(stop_words=stopwords, min_df=1000)\n",
+    "tfidf_word = TfidfVectorizer(stop_words=list(stopwords), min_df=1000)\n",
     "dt_word = tfidf_word.fit_transform(headlines[\"headline_text\"])"
    ]
   },
From cef1092a28d63a1e623af9744a883bef66000183 Mon Sep 17 00:00:00 2001
From: Bernhard Merkle
Date: Sat, 4 Jan 2025 23:28:49 +0100
Subject: [PATCH 3/3] Add files to .gitignore for chapter 04

---
 .gitignore | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.gitignore b/.gitignore
index b6e4761..61eaad5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,8 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# files created by notebook instructions
+ch04/lid.176.ftz
+ch04/reddit_dataframe.pkl
+ch04/reddit-selfposts.db