-
Notifications
You must be signed in to change notification settings - Fork 0
/
jobs_controller.py
executable file
·121 lines (104 loc) · 4.57 KB
/
jobs_controller.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os
import random
import utils
from classifier import job_classifier
class JobsController:
    """Keep the list of job ads in memory and persist user-submitted ones.

    Job ads are dictionaries. The keys this class relies on are
    ``"webindex"`` (unique identifier, handled as a string for new jobs),
    ``"category"``, ``"title"``, ``"company"`` and ``"description"``.
    """

    # Directory passed to utils.populate_jobs for the bundled job ads.
    JOBS_DIR = os.path.join(".", "data")
    # Directory where add_new_job stores user-submitted ads, one
    # sub-directory per category.
    USER_DIR = os.path.join(".", "user")

    def __init__(self, retrain=False):
        """Load all job ads, derive the category endpoints and load the model.

        Args:
            retrain: when true, the classifier is retrained from time to
                time as new jobs are added (see ``add_new_job``).
        """
        self.job_ads = []
        # populate_jobs appends the ads to self.job_ads and returns the
        # category names it encountered.
        tot_cat_names = utils.populate_jobs(self.job_ads, JobsController.JOBS_DIR)
        # Check the actual user directory rather than any entry that merely
        # happens to be called "user" in the working directory.
        if os.path.isdir(JobsController.USER_DIR):
            user_cats = utils.populate_jobs(self.job_ads, JobsController.USER_DIR)
            tot_cat_names.extend(user_cats)
        self.job_ads.sort(key=lambda job: job["webindex"])
        self.cat_names = []
        self.__update_categories(tot_cat_names)
        self.clf = job_classifier.load_model(
            "job_classifier.mdl", "vectorizer.transformer"
        )
        self.retrain = retrain

    def __update_categories(self, cats: list):
        """Merge ``cats`` into the known categories and rebuild the endpoints.

        ``self.cat_names`` becomes the sorted, de-duplicated union of the
        existing and the new names. ``self.cat_endpoints`` is rebuilt in the
        same order: each endpoint is the lower-cased concatenation of the
        first 3 letters of every word of the category (" & " counts as a
        word separator). On a clash the prefix grows by one letter at a
        time; once the words are exhausted a numeric suffix is appended, so
        the search always terminates.

        Args:
            cats: category names to add.
        """
        self.cat_names = sorted(set(self.cat_names).union(cats))
        endpoints = []
        taken = set()
        for cat_name in self.cat_names:
            words = cat_name.replace(" & ", " ").split(" ")
            longest = max(len(word) for word in words)
            endpoint = None
            # Uniqueness is checked on the lower-cased form because that is
            # what ends up in the URL.
            for width in range(3, max(longest, 3) + 1):
                candidate = "".join(word[:width] for word in words).lower()
                if candidate not in taken:
                    endpoint = candidate
                    break
            if endpoint is None:
                # Taking more letters cannot help any more (e.g. "A & B"
                # versus "A B"): disambiguate with a counter instead of
                # looping forever.
                base = "".join(words).lower()
                suffix = 2
                endpoint = f"{base}{suffix}"
                while endpoint in taken:
                    suffix += 1
                    endpoint = f"{base}{suffix}"
            taken.add(endpoint)
            endpoints.append(endpoint)
        self.cat_endpoints = endpoints

    def job_count(self):
        """Return the number of job ads currently held in memory."""
        return len(self.job_ads)

    def categories(self):
        """Return ``(category name, "section/<endpoint>")`` pairs."""
        return [
            (name, "section/" + endpoint)
            for name, endpoint in zip(self.cat_names, self.cat_endpoints)
        ]

    def predict(self, title, description):
        """Return the classifier's prediction for a title and description."""
        return self.clf.predict(title, description)

    def get_job(self, webindex):
        """Return the list of jobs whose webindex equals ``webindex``.

        The list is empty when no job matches.
        """
        return [job for job in self.job_ads if job["webindex"] == webindex]

    @staticmethod
    def _category_dir_name(category):
        """Encode a category name as a single, safe directory name.

        " & " is encoded as "_" and every remaining space as "-".

        Raises:
            ValueError: if the result would be empty or could escape the
                user directory (path separators, "." or "..").
        """
        dir_name = category.replace(" & ", "_").replace(" ", "-")
        if dir_name in ("", ".", "..") or "/" in dir_name or "\\" in dir_name:
            raise ValueError(f"Invalid job category: {category!r}")
        return dir_name

    def add_new_job(self, job_dict: dict):
        """Store a new job on disk and register it in memory.

        The job is written to ``USER_DIR/<category dir>/<webindex>.txt`` as
        UTF-8 encoded "Title/Webindex/Company/Description" lines. The input
        dictionary is not modified.

        Args:
            job_dict: job with the keys "title", "company", "description"
                and "category".

        Returns:
            The webindex (string) assigned to the new job.

        Raises:
            ValueError: if the category cannot be used as a directory name.
        """
        job = job_dict.copy()
        job["category"] = job["category"].strip()
        # Validate before touching the disk: the category comes from the
        # submitter and is used as a path component.
        cls_dir = self._category_dir_name(job["category"])
        job["webindex"] = self.get_new_webindex()

        # Creates the user directory and the category directory as needed
        # and is a no-op when they already exist.
        target_dir = os.path.join(JobsController.USER_DIR, cls_dir)
        os.makedirs(target_dir, exist_ok=True)

        # The description is flattened so that it stays on a single line.
        w_out = [
            f"Title: {job['title']}\n",
            f"Webindex: {job['webindex']}\n",
            f"Company: {job['company']}\n",
            f"Description: {' '.join(job['description'].splitlines())}\n",
        ]
        # Files are encoded to bytes
        with open(os.path.join(target_dir, f"{job['webindex']}.txt"), "wb") as f:
            f.writelines(line.encode("utf-8") for line in w_out)

        # Add job to current running jobs list and categories
        self.job_ads.append(job)
        self.job_ads.sort(key=lambda item: item["webindex"])
        if job["category"] not in self.cat_names:
            self.__update_categories([job["category"]])
        self.__retrain_model()
        return job["webindex"]

    def get_new_webindex(self):
        """Return a random 8-digit webindex (string) not used by any job."""
        # Compare as strings: webindexes are handed out as strings, so an
        # integer candidate would never be found in the set of taken ones.
        taken_indices = {str(job["webindex"]) for job in self.job_ads}
        new_ind = str(random.randint(10000000, 99999999))
        # Collisions are unlikely (90 million possible values) but must
        # still be avoided.
        while new_ind in taken_indices:
            new_ind = str(random.randint(10000000, 99999999))
        return new_ind

    def get_job_section(self, section):
        """Return the jobs of one section.

        Args:
            section: a category endpoint, or ``None`` for all jobs.

        Returns:
            A ``(jobs, category name, found)`` tuple. When ``section`` is
            ``None`` or unknown, all jobs are returned under the name
            "All available"; ``found`` is false only for an unknown section.
        """
        if section is None:
            return self.job_ads, "All available", True
        for cat_name, cat_endpoint in zip(self.cat_names, self.cat_endpoints):
            if section == cat_endpoint:
                jobs = [job for job in self.job_ads if job["category"] == cat_name]
                return jobs, cat_name, True
        return self.job_ads, "All available", False

    def __retrain_model(self):
        """Retrain the classifier on every 10th job when retraining is on."""
        if self.retrain and self.job_count() % 10 == 0:
            print("Retraining model...")
            self.clf.retrain(self.job_ads)