-
Notifications
You must be signed in to change notification settings - Fork 0
/
functions.py
101 lines (83 loc) · 3.49 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# ==============================================================================
# file: functions.py
# author: Allen Sun
# created date: 2024-06-09
# Copyright: Copyright 2024 Allen Sun. All rights reserved.
# License: MIT License
# Contact me: [email protected]
# Description: Helper functions for extracting words from text and PDF files and checking them against a dictionary word list.
# ==============================================================================
import os
import re
from concurrent.futures import ThreadPoolExecutor
import fitz # PyMuPDF library
def extract_words_from_niujin(path):
    """
    Extract the unique leading words from a dictionary-style text file.

    The file is decoded as latin-1. A line contributes a word only when it
    starts (with no leading whitespace) with a run of word characters and/or
    hyphens that is immediately followed by whitespace or the end of the line.
    Words containing a hyphen are discarded.

    Args:
        path: Path of the text file to read.

    Returns:
        A list of unique words ordered by length, with alphabetical order
        breaking ties so the result is identical from run to run.
    """
    headword = re.compile(r'^[\w-]+(?=\s|$)')
    unique_words = set()
    with open(path, "r", encoding='latin-1') as file:
        for line in file:
            found = headword.match(line)
            # The match is exactly the first whitespace-delimited token of the line.
            if found and "-" not in found.group():
                unique_words.add(found.group())
    # Sorting a set by length alone would leave equal-length words in
    # hash-dependent order; the secondary key makes the order deterministic.
    return sorted(unique_words, key=lambda word: (len(word), word))
def get_all_txt_files(path):
    """
    Collect the paths of every .txt file found under a directory tree.

    The directory is walked recursively and the extension is compared
    case-insensitively. Each returned path is the walked directory joined
    with the file name.
    """
    txt_paths = []
    for folder, _subfolders, names in os.walk(path):
        for name in names:
            if name.lower().endswith('.txt'):
                txt_paths.append(os.path.join(folder, name))
    return txt_paths
def get_all_pdf_files(path):
    """
    Collect the paths of every .pdf file found under a directory tree.

    The directory is walked recursively and the extension is compared
    case-insensitively. Each returned path is the walked directory joined
    with the file name.
    """
    pdf_paths = []
    for current_dir, _dirs, filenames in os.walk(path):
        pdf_paths.extend(
            os.path.join(current_dir, filename)
            for filename in filenames
            if filename.lower().endswith('.pdf')
        )
    return pdf_paths
def process_text(content):
    """
    Reduce raw text to a newline-separated list of unique lowercase words.

    The text is lowercased, then hyphens and runs of whitespace are treated
    as word separators (so "well-known" yields "well" and "known").
    Punctuation and digits are deleted from inside each remaining token;
    underscores count as word characters and are kept. Tokens that are still
    longer than 3 characters are de-duplicated and ordered by length, with
    alphabetical order breaking ties so the output is reproducible.

    Args:
        content: The text to process.

    Returns:
        A single string containing one word per line ("" if nothing qualifies).
    """
    content = content.lower().replace('-', '\n')
    content = re.sub(r'\s+', '\n', content)
    # Deletions happen after tokenising, so "it's" becomes "its" and
    # "abc123def" becomes "abcdef" rather than being split apart.
    content = re.sub(r'[^\w\s-]', '', content)
    content = re.sub(r'\d', '', content)
    unique_words = {
        token.strip() for token in content.split('\n') if len(token.strip()) > 3
    }
    # A bare key=len sort over a set leaves equal-length words in
    # hash-dependent order; the secondary key makes the order deterministic.
    return '\n'.join(sorted(unique_words, key=lambda word: (len(word), word)))
def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file using PyMuPDF.

    For every page, the plain text returned by ``page.get_text()`` is
    collected, followed by one entry per line of every block that has a
    'lines' key in ``page.get_text("dict")``, with that line's span texts
    joined by single spaces. Each entry is terminated by a newline.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The concatenated text of all pages ("" for a document without pages).
    """
    document = fitz.open(pdf_path)
    pieces = []
    try:
        for page in document:
            pieces.append(page.get_text())
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a 'lines' entry are skipped
                # (presumably image blocks — confirm against PyMuPDF docs).
                for line in block.get("lines", ()):
                    pieces.append(" ".join(span["text"] for span in line["spans"]))
    finally:
        # Release the file handle even when a page fails to parse.
        document.close()
    return "".join(piece + "\n" for piece in pieces)
def check_word(word, dictionary_words):
    """
    Look a word up in the dictionary collection.

    Returns the word itself when it is contained in ``dictionary_words``,
    otherwise ``None``.
    """
    if word in dictionary_words:
        return word
    return None
def check_target_words_in_oxford(target_words, dictionary_words):
    """
    Keep the target words that appear in the dictionary and save them to a file.

    The matching words are written, one per line and UTF-8 encoded, to
    ``results.txt`` in the current working directory, and a short summary is
    printed. Input order and duplicates are preserved; falsy entries such as
    the empty string are never reported.

    Args:
        target_words: Iterable of candidate words (expected to be strings).
        dictionary_words: Collection of known words. A list or tuple is
            converted to a frozenset once so each lookup is constant-time.

    Returns:
        The list of target words found in ``dictionary_words``.
    """
    lookup = dictionary_words
    if isinstance(dictionary_words, (list, tuple)):
        try:
            lookup = frozenset(dictionary_words)
        except TypeError:
            # Unhashable entries: fall back to scanning the original sequence.
            lookup = dictionary_words

    # Membership tests are CPU-bound and the previous thread pool waited on
    # each future right after submitting it, so it ran serially anyway (and
    # crashed when os.cpu_count() returned None). A plain filter is equivalent.
    results = [word for word in target_words if word in lookup and word]

    # Explicit encoding so non-ASCII words are written identically on every platform.
    with open("results.txt", "w", encoding="utf-8") as file:
        file.writelines(word + "\n" for word in results)

    print(f"已将自制词汇表输出至:{os.getcwd()}/results.txt")
    print(f"自制词汇表单词数量:{len(results)}")
    return results