hasher.py
import csv
import hashlib
import os
from math import ceil

# Name of the prompt set being processed; used as a prefix for all output files.
prompt_name = "simple"

def generate_hash(row, hash_length=16):
    """
    Generates a unique hash from the content column of a row.

    Args:
        row (list): A list representing a row in the CSV file.
        hash_length (int): The desired length of the shortened hash (default is 16).

    Returns:
        str: The generated hash value.
    """
    # The hash is derived from the sixth column (index 5), which holds the
    # generated text. A configurable-column variant is left commented out:
    # values = [str(row[col]) for col in columns]
    # combined_value = ''.join(str(value) for value in row)
    combined_value = row[5]
    full_hash = hashlib.sha256(combined_value.encode()).hexdigest()
    return full_hash[:hash_length]
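
# A minimal sketch of the expected behavior, assuming a hypothetical row
# whose sixth column (index 5) holds the text "hello":
#
#   >>> generate_hash(["id", "prompt", "a", "b", "c", "hello"])
#   '2cf24dba5fb0a30e'
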
def check_text(text):
    """
    Checks whether a text is usable: not empty, not blank, and not the known
    generation error message.

    Args:
        text: The text to be checked.

    Returns:
        False if the text is empty, blank, or contains the error string;
        True otherwise.
    """
    return not (not text or text.isspace()
                or "Repetition_penalty should be greater than 0.0" in text)
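
# For example, check_text screens out rows that would otherwise produce
# meaningless hashes:
#
#   >>> check_text("")       # empty
#   False
#   >>> check_text("   ")    # whitespace only
#   False
#   >>> check_text("error: Repetition_penalty should be greater than 0.0")
#   False
#   >>> check_text("A normal generated answer.")
#   True
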
def split_list_n(data, n):
    """
    Splits a list into n sublists with (almost) the same number of elements.

    Args:
        data: The list to be split.
        n: The number of sublists to create.

    Returns:
        A list of n sublists; when len(data) is not a multiple of n, the
        trailing sublists may be shorter (or empty).
    """
    if n <= 0:
        raise ValueError("Number of sublists (n) must be positive")
    group_size = ceil(len(data) / n)  # ceil so every element fits into n groups
    sublists = []
    for i in range(n):
        start_index = i * group_size
        end_index = min(start_index + group_size, len(data))  # clamp the last sublist
        sublists.append(data[start_index:end_index])
    return sublists
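
# For example, five elements split across two annotators yield groups of
# ceil(5 / 2) = 3 and the remaining 2:
#
#   >>> split_list_n([1, 2, 3, 4, 5], 2)
#   [[1, 2, 3], [4, 5]]
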
# TODO: make input_directory work with subdirectories
def process_csv_files(input_directory, output_directory, output_directory_short,
                      output_directory_human, split_n):
    # Create every output directory up front, not just the first one.
    os.makedirs(output_directory, exist_ok=True)
    os.makedirs(output_directory_short, exist_ok=True)
    os.makedirs(output_directory_human, exist_ok=True)
    total_count = 0
    total_elements = []       # [hashkey, blank score, content] rows for annotators
    total_complete_els = []   # [hashkey, model] + full row, across all files
    header = None
    for filename in os.listdir(input_directory):
        if filename.endswith('.csv'):
            input_file = os.path.join(input_directory, filename)
            output_file = os.path.join(output_directory, f'{prompt_name}_hashed_{filename}')
            output_file_short = os.path.join(output_directory_short, f'{prompt_name}_hashed_{filename}')
            # Open the input CSV file
            with open(input_file, 'r', encoding='utf-8', newline='') as in_f:
                reader = csv.reader(in_f)
                header = next(reader)  # Get the header row
                # The hash is currently derived from row[5]; a configurable
                # column list would look like:
                # column_indices = [header.index('column1'), header.index('column2'), ...]
                # Check if the "hashkey" column already exists
                if "hashkey" in header:
                    hashkey_index = header.index("hashkey")
                else:
                    hashkey_index = None
                # Create a new CSV file with the "hashkey" column
                with open(output_file, 'w', encoding='utf-8', newline='') as out_f, \
                        open(output_file_short, 'w', encoding='utf-8', newline='') as out_f_short:
                    writer = csv.writer(out_f)
                    writer2 = csv.writer(out_f_short)
                    # Write the full header with the "hashkey" column added or replaced
                    if hashkey_index is None:
                        writer.writerow(['hashkey'] + header)
                    else:
                        writer.writerow([header[i] if i != hashkey_index else "hashkey"
                                         for i in range(len(header))])
                    # The short file always has two columns: hashkey and content.
                    writer2.writerow(['hashkey', "内容"])  # 内容 = "content"
                    # Process each row, skipping blank or error rows, and generate the hash
                    count = 0
                    for row in reader:
                        if check_text(row[5]):
                            count += 1
                            hash_value = generate_hash(row)
                            if hashkey_index is None:
                                writer.writerow([hash_value] + row)
                            else:
                                row[hashkey_index] = hash_value
                                writer.writerow(row)
                            writer2.writerow([hash_value, row[5]])
                            # The empty second field is a blank score column for annotators.
                            total_elements.append([hash_value, "", row[5]])
                            # Assumes input filenames with at least four dot-separated
                            # fields; the third and fourth are taken as the model name.
                            model_name = filename.split('.')[2] + "." + filename.split('.')[3]
                            total_complete_els.append([hash_value, model_name] + row)
            total_count += count
            print(f"Processed file: {filename}, count: {count}")
    print(f"total count: {total_count}")
    lists = split_list_n(total_elements, split_n)
    encoding = 'utf-8'
    # One combined file with every processed row, for the human-review set.
    output_file_complete = os.path.join(output_directory_human, f'{prompt_name}_hashed_all.csv')
    with open(output_file_complete, 'w', encoding=encoding, newline='') as out_f_complete:
        writer = csv.writer(out_f_complete)
        if header is not None:
            writer.writerow(['hashkey', 'model'] + header)
        writer.writerows(total_complete_els)
    # One file per annotator split.
    for index, l in enumerate(lists):
        output_file_short = os.path.join(output_directory_human, f'{prompt_name}_hashed_{index}_{encoding}.csv')
        with open(output_file_short, 'w', encoding=encoding, newline='') as out_f_short:
            writer = csv.writer(out_f_short)
            # Annotator headers: 标号(请勿修改) = "ID (do not modify)",
            # 分数 = "score", 内容 = "content".
            writer.writerow(['标号(请勿修改)', '分数', "内容"])
            writer.writerows(l)
        print(f"Processed file: hashed_{index}.csv, count: {len(l)}")
if __name__ == '__main__':
    directory_path = './out/' + prompt_name
    output_directory = './hashed_out'
    output_directory_short = './hashed_out_short'
    output_directory_human = './human_dojo_text'
    split_n = 3
    process_csv_files(directory_path, output_directory, output_directory_short,
                      output_directory_human, split_n)