# cluster_documents.py
# Import libraries
import os
import re
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# The stopword list ships separately from NLTK itself, so fetch it on first run
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords", quiet=True)

# Define some helper functions
def read_files(path):
    # Read all text files from a given path and return lists of
    # file names and file contents
    file_names = []
    file_contents = []
    for file in os.listdir(path):
        if file.endswith(".txt"):
            file_names.append(file)
            with open(os.path.join(path, file), encoding="utf8", errors="ignore") as f:
                file_contents.append(f.read())
    return file_names, file_contents

def preprocess(text):
    # Preprocess a text: lower-case it, strip punctuation and numbers,
    # drop stopwords, and stem the remaining words
    stop_words = set(stopwords.words("english"))  # set makes membership tests O(1)
    stemmer = SnowballStemmer("english")
    text = text.lower()                  # convert to lower case
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    text = re.sub(r"\d+", "", text)      # remove numbers
    words = [stemmer.stem(word) for word in text.split() if word not in stop_words]
    return " ".join(words)               # join stemmed words back into one string
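
# Example (approximate; the exact stems depend on the Snowball stemmer):
#   preprocess("The 3 runners were running quickly!")
#   -> "runner run quick"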

def print_clusters(model, svd, feature_names, n_top_words):
    # Print the cluster labels and the top words for each cluster.
    # K-means was fit in the reduced LSA space, so its centroids must be
    # mapped back to term space before ranking vocabulary words; indexing
    # feature_names with topic-space coordinates would be meaningless.
    print("Cluster labels:")
    print(model.labels_)
    print()
    print("Top words per cluster:")
    original_space_centroids = svd.inverse_transform(model.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]
    for i in range(model.n_clusters):
        print("Cluster %d:" % i, end="")
        for ind in order_centroids[i, :n_top_words]:
            print(" %s" % feature_names[ind], end="")
        print()

def save_clusters(file_names, labels, path):
    # Save each document's cluster label to a text file with the same
    # filename in the given path
    for file_name, label in zip(file_names, labels):
        with open(os.path.join(path, file_name), "w") as f:
            f.write(str(label))

# Define some parameters
input_path = "C:\\python\\autoindex\\txt_output"  # input directory of text files
output_path = "C:\\python\\autoindex\\category"   # output directory for cluster labels
n_components = 10  # number of latent topics to extract using LSA
n_clusters = 5     # number of clusters to form using K-means
n_top_words = 10   # number of top words to display per cluster

# Create the output directory up front so save_clusters can write into it
os.makedirs(output_path, exist_ok=True)

# Read the text files from the input directory
file_names, file_contents = read_files(input_path)
# Create a pandas dataframe to store the file names and contents
data = pd.DataFrame({"file_name": file_names, "file_content": file_contents})
# Preprocess the file contents
data["file_content"] = data["file_content"].apply(preprocess)
# Create a document-term matrix using TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(data["file_content"])
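
# Optional check: max_features=1000 caps the vocabulary at the 1000 most
# frequent terms, so X is a sparse matrix with at most 1000 columns
print("Document-term matrix shape:", X.shape)
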
# Apply LSA using truncated SVD to reduce the dimensionality and extract latent topics
svd = TruncatedSVD(n_components=n_components)
X_topics = svd.fit_transform(X)
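
# Optional check: explained_variance_ratio_ reports how much of the TF-IDF
# variance the LSA components retain; a very low total suggests raising
# n_components
print("LSA explained variance: %.1f%%" % (svd.explained_variance_ratio_.sum() * 100))
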
# Cluster the documents based on their topic scores using K-means
km = KMeans(n_clusters=n_clusters, random_state=0)
km.fit(X_topics)
# Print the cluster labels and the top words for each cluster to the console
feature_names = vectorizer.get_feature_names_out()
print_clusters(km, svd, feature_names, n_top_words)
# Save the cluster labels to a text file with the same filename in the output directory
save_clusters(data["file_name"], km.labels_, output_path)