-
Notifications
You must be signed in to change notification settings - Fork 0
/
note_to_json_daily.py
91 lines (63 loc) · 2.66 KB
/
note_to_json_daily.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os, re, json
import meilisearch
import html
daily_index_id = 'daily'
linely_index_id = 'linely'

# Directory that holds the exported note ``.txt`` files.
path = 'F:\\Anaconda_Play\\小米便签批量导出\\2022-7-29-2\\'

# A "<month>月<day>日" date heading.  The same compiled pattern both locates
# the start of each day and supplies the date string, so the two steps can
# never disagree (and a digit-less "月日" is never mistaken for a date).
DATE_PATTERN = re.compile(r'\d{1,2}月\d{1,2}日')

# Token that replaces newlines inside the stored per-day content.
LINEBREAK_TOKEN = '__LINEBREAK__'


def parse_note(note_content, file_name, daily_start=0, linely_start=0):
    """Split one note's text into per-day and per-line documents.

    The text is first HTML-unescaped, i.e. named and numeric character
    references such as ``&gt;``, ``&#62;`` or ``&#x3e;`` become the
    corresponding Unicode characters.  It is then cut at every date heading
    matched by ``DATE_PATTERN``; each day runs from its heading up to the
    next heading (or the end of the text).  Text before the first heading
    has no date and is skipped.

    Args:
        note_content: Raw text of one exported note file.
        file_name: Name stored in the ``file_name`` field of every document.
        daily_start: ``id`` given to the first day document produced.
        linely_start: ``id`` given to the first line document produced.

    Returns:
        A ``(days, lines)`` tuple of lists of dicts.  Day documents have the
        keys ``id``, ``date``, ``file_name`` and ``content`` (newlines
        replaced by ``LINEBREAK_TOKEN``).  Line documents have the keys
        ``id``, ``day_index`` (the ``id`` of the owning day), ``date``,
        ``file_name`` and ``content`` (one line of that day).
    """
    text = html.unescape(note_content)
    headings = list(DATE_PATTERN.finditer(text))
    # Each day ends where the next heading begins; the last one ends at EOF.
    ends = [m.start() for m in headings[1:]] + [len(text)]

    days = []
    lines = []
    for heading, end in zip(headings, ends):
        date = heading.group(0)
        content = text[heading.start():end].replace('\n', LINEBREAK_TOKEN)
        day_id = daily_start + len(days)
        days.append({
            'id': day_id,
            'date': date,
            'file_name': file_name,
            'content': content})
        for line in content.split(LINEBREAK_TOKEN):
            lines.append({
                'id': linely_start + len(lines),
                'day_index': day_id,
                'date': date,
                'file_name': file_name,
                'content': line})
    return days, lines


def collect_notes(directory):
    """Parse every ``.txt`` file in *directory* into day and line documents.

    Files are visited in sorted name order so that document ids are
    reproducible between runs.  Ids are consecutive across all files,
    starting at 0.

    Returns:
        A ``(daily_list, lines_list)`` tuple as described in ``parse_note``.
    """
    daily_list = []
    lines_list = []
    for file in sorted(os.listdir(directory)):
        # Only real ".txt" files; a name like "a.txt.bak" is not a note.
        if not file.endswith('.txt'):
            continue
        with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
            note_content = f.read()
        days, lines = parse_note(
            note_content,
            file,
            daily_start=len(daily_list),
            linely_start=len(lines_list))
        daily_list.extend(days)
        lines_list.extend(lines)
    return daily_list, lines_list


def export_and_index(client, index_id, documents):
    """Write *documents* to ``<index_id>.json`` and add them to the index.

    The JSON file is kept as a human-readable copy of what was uploaded;
    the in-memory list is sent to the search server directly instead of
    being re-read from that file.
    """
    with open(index_id + '.json', 'w', encoding='utf-8') as f:
        json.dump(documents, f, indent=4, ensure_ascii=False)
    client.index(index_id).add_documents(documents)


if __name__ == '__main__':
    daily_list, lines_list = collect_notes(path)
    client = meilisearch.Client('http://127.0.0.1:7700')
    export_and_index(client, daily_index_id, daily_list)
    export_and_index(client, linely_index_id, lines_list)
# Reference example, kept for later use:
# client.index('movies').add_documents([{
# 'id': 287947,
# 'title': 'Shazam',
# 'poster': 'https://image.tmdb.org/t/p/w1280/xnopI5Xtky18MPhK40cZAGAOVeV.jpg',
# 'overview': 'A boy is given the ability to become an adult superhero in times of need with a single magic word.',
# 'release_date': '2019-03-23'
# }])