From 9d5750e34403cfb5d04d54072bb2280c0a389c11 Mon Sep 17 00:00:00 2001
From: Bernhard Merkle
Date: Sat, 4 Jan 2025 23:23:38 +0100
Subject: [PATCH 1/3] fix TfidfVectorizer(stop_words=list(stopwords))

---
 ch05/Feature_Engineering_Similarity.ipynb | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/ch05/Feature_Engineering_Similarity.ipynb b/ch05/Feature_Engineering_Similarity.ipynb
index 6a04ae9..fef290a 100644
--- a/ch05/Feature_Engineering_Similarity.ipynb
+++ b/ch05/Feature_Engineering_Similarity.ipynb
@@ -764,9 +764,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "%%time\n",
@@ -880,7 +878,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "blueprints",
    "language": "python",
    "name": "python3"
   },
@@ -894,7 +892,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.8"
   },
   "toc": {
    "base_numbering": 1,
From 1d660b989416b0436ee4cb7a1eb195ed8cf83ef5 Mon Sep 17 00:00:00 2001
From: Bernhard Merkle
Date: Sat, 4 Jan 2025 23:24:38 +0100
Subject: [PATCH 2/3] Update TfidfVectorizer to use list of stopwords

---
 ch05/Feature_Engineering_Similarity.ipynb | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/ch05/Feature_Engineering_Similarity.ipynb b/ch05/Feature_Engineering_Similarity.ipynb
index fef290a..9343174 100644
--- a/ch05/Feature_Engineering_Similarity.ipynb
+++ b/ch05/Feature_Engineering_Similarity.ipynb
@@ -451,7 +451,7 @@
    "source": [
     "from spacy.lang.en.stop_words import STOP_WORDS as stopwords\n",
     "print(len(stopwords))\n",
-    "tfidf = TfidfVectorizer(stop_words=stopwords)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords))\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])\n",
     "dt"
    ]
@@ -469,7 +469,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords, min_df=2)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), min_df=2)\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])\n",
     "dt"
    ]
@@ -480,7 +480,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords, min_df=.0001)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), min_df=.0001)\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])\n",
     "dt"
    ]
@@ -498,7 +498,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords, max_df=0.1)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), max_df=0.1)\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])\n",
     "dt"
    ]
@@ -527,11 +527,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,2), min_df=2)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), ngram_range=(1,2), min_df=2)\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])\n",
     "print(dt.shape)\n",
     "print(dt.data.nbytes)\n",
-    "tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,3), min_df=2)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), ngram_range=(1,3), min_df=2)\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])\n",
     "print(dt.shape)\n",
     "print(dt.data.nbytes)"
@@ -575,7 +575,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords))\n",
     "dt = tfidf.fit_transform(headlines[\"lemmas\"].map(str))\n",
     "dt"
    ]
@@ -586,7 +586,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords))\n",
     "dt = tfidf.fit_transform(headlines[\"nav\"].map(str))\n",
     "dt"
    ]
@@ -634,7 +634,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords, min_df=2)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), min_df=2)\n",
     "dt = tfidf.fit_transform(headlines[\"lemmas\"].map(str))\n",
     "dt"
    ]
@@ -690,7 +690,7 @@
    "source": [
     "# there are \"test\" headlines in the corpus\n",
     "stopwords.add(\"test\")\n",
-    "tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,2), min_df=2, norm='l2')\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), ngram_range=(1,2), min_df=2, norm='l2')\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])"
    ]
   },
@@ -839,7 +839,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf_word = TfidfVectorizer(stop_words=stopwords, min_df=1000)\n",
+    "tfidf_word = TfidfVectorizer(stop_words=list(stopwords), min_df=1000)\n",
     "dt_word = tfidf_word.fit_transform(headlines[\"headline_text\"])"
    ]
   },
From cef1092a28d63a1e623af9744a883bef66000183 Mon Sep 17 00:00:00 2001
From: Bernhard Merkle
Date: Sat, 4 Jan 2025 23:28:49 +0100
Subject: [PATCH 3/3] Add files to .gitignore for chapter 04

---
 .gitignore | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.gitignore b/.gitignore
index b6e4761..61eaad5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,8 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# files created by notebook instructions
+ch04/lid.176.ftz
+ch04/reddit_dataframe.pkl
+ch04/reddit-selfposts.db