diff --git a/scripts/datamanip/GTVerifier.py b/scripts/datamanip/GTVerifier.py
new file mode 100644
index 000000000..7430ae4c5
--- /dev/null
+++ b/scripts/datamanip/GTVerifier.py
@@ -0,0 +1,149 @@
+import sys
+import re
+import argparse
+import numpy as np
+from sklearn.metrics import pairwise_distances
+import parse_common as pc
+
+
+# This is a deliberately naive, brute-force implementation of GT computation so that we avoid errors
+# and can compare the GTs generated by more sophisticated methods.
+# This code assumes that all the data fits in memory.
+
+K = 100
+STR_NO_FILTER = "__NONE__"
+# sklearn's pairwise_distances has no "linear" metric, so MIPS is mapped to a negative
+# inner-product callable: smaller values correspond to larger inner products.
+dist_fn_sklearn_metric_map = {"l2": "euclidean",
+                              "cosine": "cosine",
+                              "mips": lambda a, b: -np.dot(a, b)}
+
+
+ground_truth = {}
+
+class FilteredVectors:
+    def __init__(self, filter):
+        self.data = []
+        self.filter = filter
+        self.orig_id_fil_id_map = {}
+        self.count = 0
+        self.data_mat = None
+
+    def add_vector(self, vector, orig_id):
+        self.data.append(vector)
+        self.orig_id_fil_id_map[self.count] = orig_id
+        self.count += 1
+
+    def assign_data_mat(self, data_mat):
+        self.data_mat = data_mat
+
+    def __len__(self):
+        return len(self.data) if self.data_mat is None else self.data_mat.num_rows
+
+    def __getitem__(self, index):
+        return self.data[index]
+
+    def __str__(self) -> str:
+        return self.filter + " filters out " + str(self.count) + " vectors"
+
+
+def parse_filter_line(line, delim_regex, line_number, filter_file_name):
+    line = line.strip()
+    filters = re.split(delim_regex, line)
+    if len(filters) == 0:
+        raise Exception(f"Filter line: {line} at line number: {line_number} in file {filter_file_name} does not have any filters")
+    return filters
+
+def process_filters(filter_file, data_mat, is_query):
+    filters = []
+    filtered_vectors = {}
+    if filter_file is not None:
+        with open(filter_file, mode='r', encoding='UTF-8') as f:
+            line_num = 0
+            for line in f:
+                filters_of_point = parse_filter_line(line, ',', line_num + 1, filter_file)
+                filters.append(filters_of_point)
+                for filter in filters_of_point:
+                    if filter not in filtered_vectors:
+                        filtered_vectors[filter] = FilteredVectors(filter)
+                    filtered_vectors[filter].add_vector(data_mat.get_vector(line_num), line_num)
+                line_num += 1
+    else:
+        if not is_query:
+            filtered_vectors[STR_NO_FILTER] = FilteredVectors(STR_NO_FILTER)
+            filtered_vectors[STR_NO_FILTER].assign_data_mat(data_mat)
+        else:
+            # To simplify the code, we copy the queries one-by-one into the data list.
+            filtered_vectors[STR_NO_FILTER] = FilteredVectors(STR_NO_FILTER)
+            for i in range(data_mat.num_rows):
+                filtered_vectors[STR_NO_FILTER].add_vector(data_mat.get_vector(i), i)
+
+    for filter in filtered_vectors:
+        print(filtered_vectors[filter])
+
+    unique_vector_ids = set()
+    for filtered_vector in filtered_vectors.values():
+        unique_vector_ids.update(filtered_vector.orig_id_fil_id_map.values())
+    all_ids = set(range(data_mat.num_rows))
+    if len(all_ids.difference(unique_vector_ids)) > 0:
+        raise Exception(f"Missing vectors in filters: {all_ids.difference(unique_vector_ids)}")
+
+    return filters, filtered_vectors
+
+def compute_filtered_gt(base_filtered_vector, query_filtered_vector, dist_fn):
+    print(f"Computing GT for filter: {query_filtered_vector.filter}, base count: {len(base_filtered_vector)}, query count: {len(query_filtered_vector)}")
+    for fil_q_id, query_vector in enumerate(query_filtered_vector.data):
+        qv = query_vector.reshape(1, -1)
+        if base_filtered_vector.data_mat is not None:
+            dist = pairwise_distances(base_filtered_vector.data_mat.data, qv, metric=dist_fn_sklearn_metric_map[dist_fn])
+        else:
+            dist = pairwise_distances(base_filtered_vector.data, qv, metric=dist_fn_sklearn_metric_map[dist_fn])
+
+        index_dist_pairs = [(i, dist[i][0]) for i in range(len(dist))]
+        index_dist_pairs.sort(key=lambda x: x[1])
+        k = min(K, len(index_dist_pairs))
+        top_k_matches = index_dist_pairs[:k]
+        orig_query_id = query_filtered_vector.orig_id_fil_id_map[fil_q_id]
+        ground_truth[orig_query_id] = []
+        for match in top_k_matches:
+            # When the base side is backed by the full data_mat (no base filter file), the row id
+            # already is the original id; the fil_id -> orig_id map is only populated for filtered bases.
+            base_orig_id = match[0] if base_filtered_vector.data_mat is not None else base_filtered_vector.orig_id_fil_id_map[match[0]]
+            ground_truth[orig_query_id].append((base_orig_id, match[1]))
+
+def compute_gt(base_filtered_vectors, query_filtered_vectors, dist_fn):
+    for query_filter in query_filtered_vectors.keys():
+        if query_filter not in base_filtered_vectors:
+            print(f"Filter: {query_filter} in query does not exist in base")
+            continue
+        base_filtered_vector = base_filtered_vectors[query_filter]
+        query_filtered_vector = query_filtered_vectors[query_filter]
+        compute_filtered_gt(base_filtered_vector, query_filtered_vector, dist_fn)
+
+    print(ground_truth)
+
+def main(args):
+    data_type_code, data_type_size = pc.get_data_type_code(args.data_type)
+    base_data = pc.DataMat(data_type_code, data_type_size)
+    base_data.load_bin(args.base_file)
+
+    query_data = pc.DataMat(data_type_code, data_type_size)
+    query_data.load_bin(args.query_file)
+
+    print("Grouping base vectors by filters\n")
+    base_filters, base_filtered_vectors = process_filters(args.base_filter_file, base_data, is_query=False)
+    print("Grouping query vectors by filters\n")
+    query_filters, query_filtered_vectors = process_filters(args.query_filter_file, query_data, is_query=True)
+
+    compute_gt(base_filtered_vectors, query_filtered_vectors, args.dist_fn)
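+
+# Illustrative invocation (hypothetical file names and data_type value; the filter files are
+# optional and are expected to contain one comma-separated list of labels per vector, in the
+# same order as the corresponding .bin file):
+#   python GTVerifier.py --base_file base.bin --query_file query.bin \
+#       --base_filter_file base_filters.txt --query_filter_file query_filters.txt \
+#       --output_gt_file gt.bin --dist_fn l2 --data_type float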
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Generate GTs for a given base and query file and compare with any existing GT file. Assumes that there is a ton of memory to process the data!!!', prog='GTVerifier.py')
+    parser.add_argument('--base_file', type=str, help='Base file', required=True)
+    parser.add_argument('--query_file', type=str, help='Query file', required=True)
+    parser.add_argument('--base_filter_file', type=str, help='Base filter file', required=False)
+    parser.add_argument('--query_filter_file', type=str, help='Query filter file', required=False)
+    parser.add_argument('--output_gt_file', type=str, help='Output GT file', required=True)
+    parser.add_argument('--existing_gt_file', type=str, help='Existing GT file', required=False)
+    parser.add_argument('--dist_fn', type=str, help='Distance function: l2, cosine or mips', required=True)
+    parser.add_argument('--data_type', type=str, help='Data type of the vectors', required=True)
+    args = parser.parse_args()
+
+
+    main(args)
+
+
diff --git a/scripts/datamanip/RemoveSelectedVectors.py b/scripts/datamanip/RemoveSelectedVectors.py
new file mode 100644
index 000000000..8c2a35596
--- /dev/null
+++ b/scripts/datamanip/RemoveSelectedVectors.py
@@ -0,0 +1,39 @@
+import sys
+import parse_common as pc
+
+def main(input_vector_file, data_type, ids_to_remove_file, output_file_prefix, filter_file_for_vectors):
+    data_type_code, data_type_size = pc.get_data_type_code(data_type)
+    vectors = pc.DataMat(data_type_code, data_type_size)
+    vectors.load_bin(input_vector_file)
+    vector_ids_to_remove = set()
+    with open(ids_to_remove_file, "r") as f:
+        for line in f:
+            vector_ids_to_remove.add(int(line.strip()))
+
+    filters = []
+    if filter_file_for_vectors is not None:
+        with open(filter_file_for_vectors, "r") as f:
+            for line in f:
+                filters.append(line.strip())
+
+    vectors.remove_rows(vector_ids_to_remove)
+
+    output_bin_file = output_file_prefix + "_vecs.bin"
+    vectors.save_bin(output_bin_file)
+    print(f"Removed {len(vector_ids_to_remove)} vectors. Output written to {output_bin_file}")
+
+    if len(filters) > 0:
+        output_filters_file = output_file_prefix + "_filters.txt"
+        output_filters = [filter for idx, filter in enumerate(filters) if idx not in vector_ids_to_remove]
+        with open(output_filters_file, "w") as f:
+            for output_filter in output_filters:
+                f.write(output_filter + "\n")
+        print(f"Removed {len(vector_ids_to_remove)} filters. Output written to {output_filters_file}")
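+
+# Illustrative invocation (hypothetical paths): remove the vectors whose ids are listed, one per
+# line, in remove_ids.txt from base.bin, keeping the optional filter file in sync:
+#   python RemoveSelectedVectors.py base.bin float remove_ids.txt pruned base_filters.txt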
Output written to {output_filters_file}") + + +if __name__ == "__main__": + if len(sys.argv) != 5 and len(sys.argv) != 6: + print("Usage: []") + sys.exit(1) + else: + main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5] if len(sys.argv) > 5 else None) \ No newline at end of file diff --git a/scripts/datamanip/convert/AdsMultiFilterAnalyzer.py b/scripts/datamanip/convert/AdsMultiFilterAnalyzer.py new file mode 100644 index 000000000..1cb9f4157 --- /dev/null +++ b/scripts/datamanip/convert/AdsMultiFilterAnalyzer.py @@ -0,0 +1,189 @@ +import sys +import pandas as pd + +base_unique_filters = set() +query_unique_filters = set() + +base_joined_filters_inv_indx = {} +query_joined_filters_inv_indx = {} + +joined_filters_of_point = [] +joined_filters_of_query_point = [] + + +def create_joined_filters(per_category_label_lists): + assert(len(per_category_label_lists) == 3) + joined_filters = [] + for l1 in per_category_label_lists[0]: + for l2 in per_category_label_lists[1]: + for l3 in per_category_label_lists[2]: + joined_filters.append(f"{l1}_{l2}_{l3}") + return joined_filters + + +def parse_filter_line(line, separator, line_number, base_filter_file): + line = line.strip() + if line == '': + print(f"Empty line at line number: {line_number} in {base_filter_file}") + parts = line.split(separator) + if len(parts) != 3: + print(f"line: {line} at line number: {line_number} in {base_filter_file} does not have 3 parts when split by {separator}") + + return parts + +def append_category_name_to_labels(category_id, labels): + category_name = f"C{category_id+1}" + named_labels = [f"{category_name}={part}" for part in labels] + return named_labels + + + +def load_base_file_filters(base_filter_file): + with open(base_filter_file, mode='r', encoding='UTF-8') as f: + count = 0 + for line in f: + count += 1 + cs_filters_per_category = parse_filter_line(line, '|', count, base_filter_file) + per_category_label_lists = [] + for i in range(3): + cat_labels = append_category_name_to_labels(i, cs_filters_per_category[i].split(',')) + assert(len(cat_labels) > 0) + base_unique_filters.update(cat_labels) + per_category_label_lists.append(cat_labels) + + joined_filters = create_joined_filters(per_category_label_lists) + joined_filters_of_point.append([]) + for joined_filter in joined_filters: + joined_filters_of_point[count-1].append(joined_filter) + if joined_filter not in base_joined_filters_inv_indx: + base_joined_filters_inv_indx[joined_filter] = [] + base_joined_filters_inv_indx[joined_filter].append(count) + + if count % 500000 == 0: + print(f"Processed {count} lines in {base_filter_file}") + + print(f"Obtained {len(base_unique_filters)} distinct filters from {base_filter_file}, line count: {count}") + print(f"After joining number of filters is: {len(base_joined_filters_inv_indx)}") + + + +def load_query_file_filters(query_filter_file): + with open(query_filter_file, mode='r', encoding='UTF-8') as f: + count = 0 + for line in f: + count += 1 + cs_filters_per_category = parse_filter_line(line, '|', count, query_filter_file) + per_category_label_lists = [] + for i in range(3): + cat_labels = append_category_name_to_labels(i, cs_filters_per_category[i].split(',')) + assert(len(cat_labels) > 0) + query_unique_filters.update(cat_labels) + per_category_label_lists.append(cat_labels) + + joined_filters = create_joined_filters(per_category_label_lists) + joined_filters_of_query_point.append([]) + for joined_filter in joined_filters: + joined_filters_of_query_point[count-1].append(joined_filter) + if 
+
+
+def load_query_file_filters(query_filter_file):
+    with open(query_filter_file, mode='r', encoding='UTF-8') as f:
+        count = 0
+        for line in f:
+            count += 1
+            cs_filters_per_category = parse_filter_line(line, '|', count, query_filter_file)
+            per_category_label_lists = []
+            for i in range(3):
+                cat_labels = append_category_name_to_labels(i, cs_filters_per_category[i].split(','))
+                assert(len(cat_labels) > 0)
+                query_unique_filters.update(cat_labels)
+                per_category_label_lists.append(cat_labels)
+
+            joined_filters = create_joined_filters(per_category_label_lists)
+            joined_filters_of_query_point.append([])
+            for joined_filter in joined_filters:
+                joined_filters_of_query_point[count-1].append(joined_filter)
+                if joined_filter not in query_joined_filters_inv_indx:
+                    query_joined_filters_inv_indx[joined_filter] = []
+                query_joined_filters_inv_indx[joined_filter].append(count)
+
+    print(f"Obtained {len(query_unique_filters)} distinct filters from {query_filter_file}, line count = {count}")
+    print(f"After joining number of filters is: {len(query_joined_filters_inv_indx)}")
+
+
+
+def analyze():
+    missing_query_filters = query_unique_filters.difference(base_unique_filters)
+    if len(missing_query_filters) > 0:
+        print(f"Warning: found the following query filters not in base:{missing_query_filters}")
+
+
+    bujf = set()
+    bujf.update(base_joined_filters_inv_indx.keys())
+    qujf = set()
+    qujf.update(query_joined_filters_inv_indx.keys())
+    missing_joined_filters = qujf.difference(bujf)
+    if len(missing_joined_filters) > 0:
+        print(f"Warning: found the following joined query filters not in base:{missing_joined_filters}")
+
+
+    mjqf_query_ids_map = {}
+    for filter in missing_joined_filters:
+        mjqf_query_ids_map[filter] = []
+        for index, filters_of_point in enumerate(joined_filters_of_query_point):
+            if filter == filters_of_point[0]:
+                mjqf_query_ids_map[filter].append(index)
+
+    with open('missing_joined_query_filters.txt', mode='w', encoding='UTF-8') as f:
+        for filter in mjqf_query_ids_map:
+            f.write(f"{filter}\t{len(mjqf_query_ids_map[filter])}\t{mjqf_query_ids_map[filter]}\n")
+
+
+    print(f"Number of unique base filters: {len(base_unique_filters)}")
+    print(f"Number of unique query filters: {len(query_unique_filters)}")
+    print(f"Number of joined base filters: {len(base_joined_filters_inv_indx)}")
+    print(f"Number of joined query filters: {len(query_joined_filters_inv_indx)}")
+
+def write_joined_filters(output_file_prefix):
+    base_joined_filters_file = output_file_prefix + '_base_joined_filters.txt'
+
+    with open(base_joined_filters_file, mode='w', encoding='UTF-8') as f:
+        for filters_of_point in joined_filters_of_point:
+            joined_str = ','.join(filters_of_point)
+            f.write(f"{joined_str}\n")
+    print(f"Base joined filters written to {base_joined_filters_file}")
+
+    query_joined_filters_file = output_file_prefix + '_query_joined_filters.txt'
+    with open(query_joined_filters_file, mode='w', encoding='UTF-8') as f:
+        for filters_of_point in joined_filters_of_query_point:
+            joined_str = ','.join(filters_of_point)
+            f.write(f"{joined_str}\n")
+    print(f"Query joined filters written to {query_joined_filters_file}")
+
+    base_unique_filters_file = output_file_prefix + '_base_unique_filters.txt'
+    with open(base_unique_filters_file, mode='w', encoding='UTF-8') as f:
+        sorted_list = sorted(base_unique_filters)
+        for filter in sorted_list:
+            f.write(f"{filter}\n")
+    print(f"Base unique filters written to {base_unique_filters_file}")
+
+    query_unique_filters_file = output_file_prefix + '_query_unique_filters.txt'
+    with open(query_unique_filters_file, mode='w', encoding='UTF-8') as f:
+        sorted_list = sorted(query_unique_filters)
+        for filter in sorted_list:
+            f.write(f"{filter}\n")
+    print(f"Query unique filters written to {query_unique_filters_file}")
+
+    base_joined_unique_filters_file = output_file_prefix + '_base_joined_unique_filters.txt'
+    with open(base_joined_unique_filters_file, mode='w', encoding='UTF-8') as f:
+        sorted_list = sorted(base_joined_filters_inv_indx.keys())
+        for filter in sorted_list:
+            f.write(f"{filter}\t{len(base_joined_filters_inv_indx[filter])}\n")
+    print(f"Base joined unique filters written to {base_joined_unique_filters_file}")
+
+    query_unique_joined_filters_file = output_file_prefix + '_query_joined_unique_filters.txt'
+    with open(query_unique_joined_filters_file, mode='w', encoding='UTF-8') as f:
+        sorted_list = sorted(query_joined_filters_inv_indx.keys())
+        for filter in sorted_list:
+            f.write(f"{filter}\t{len(query_joined_filters_inv_indx[filter])}\n")
+
+
+
+
+
+def main(base_filter_file, query_filter_file, output_path_prefix):
+    load_base_file_filters(base_filter_file)
+    load_query_file_filters(query_filter_file)
+    analyze()
+    write_joined_filters(output_path_prefix)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 4:
+        print("Usage: AdsMultiFilterAnalyzer.py <base_filter_file> <query_filter_file> <output_path_prefix>")
+        print("The base file should have label categories separated by | and, within a category, labels separated by commas")
+        print("The query file should have labels separated by |")
+        sys.exit(1)
+    else:
+        main(sys.argv[1], sys.argv[2], sys.argv[3])
\ No newline at end of file
diff --git a/scripts/datamanip/convert/DiffUniqueJoinedFilters.py b/scripts/datamanip/convert/DiffUniqueJoinedFilters.py
new file mode 100644
index 000000000..4f939e718
--- /dev/null
+++ b/scripts/datamanip/convert/DiffUniqueJoinedFilters.py
@@ -0,0 +1,22 @@
+import sys
+
+def main():
+    base_unique_filters = set()
+    with open("e:\\data\\adsmf\\test\\bujf_nc.txt", encoding='utf-8', mode="r") as f:
+        for line in f:
+            base_unique_filters.add(line.strip().split('\t')[0])
+
+    query_unique_filters = set()
+    with open("e:\\data\\adsmf\\test\\qujf_nc.txt", encoding='utf-8', mode="r") as f:
+        for line in f:
+            query_unique_filters.add(line.strip().split('\t')[0])
+
+    missing = query_unique_filters.difference(base_unique_filters)
+    print(f"Missing filters: {missing}")
+
+
+
+if __name__ == "__main__":
+    main()
+
+
\ No newline at end of file
diff --git a/scripts/datamanip/convert/MultiFilterToSingleFilter.py b/scripts/datamanip/convert/MultiFilterToSingleFilter.py
new file mode 100644
index 000000000..9f94a2dbf
--- /dev/null
+++ b/scripts/datamanip/convert/MultiFilterToSingleFilter.py
@@ -0,0 +1,36 @@
+import sys
+
+def main():
+    if len(sys.argv) != 4 and len(sys.argv) != 5:
+        print("Usage: {} <input_tsv_file> <vector_col_id> <filter_col_id> [filter_separator (default ,)]".format(sys.argv[0]))
+        print("The program converts a TSV file with vectors and multiple filters into vectors and single filters by repeating the vector for each filter. It assumes that the input file is a TSV file with no headers")
+        sys.exit(1)
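+
+    # Illustrative example (hypothetical row): with vector_col_id=0 and filter_col_id=1, the input row
+    #   "0.1|0.2<TAB>red,blue"
+    # is expanded into two output rows, "0.1|0.2<TAB>red" and "0.1|0.2<TAB>blue"
+    # (';' and '_' inside a filter are replaced with spaces).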
+
+    vector_col_id = int(sys.argv[2])
+    filter_col_id = int(sys.argv[3])
+    filter_separator = ',' if len(sys.argv) == 4 else sys.argv[4]
+    single_filter_file = sys.argv[1] + '.single_filter.txt'
+
+
+    with open(single_filter_file, 'w', -1, 'UTF-8') as writer:
+        with open(sys.argv[1], 'r', -1, 'UTF-8') as f:
+            for line in f:
+                line = line.strip()
+                if line == '':
+                    continue
+                parts = line.split('\t')
+                vector = parts[vector_col_id]
+                filters = parts[filter_col_id].split(filter_separator)
+                for filter in filters:
+                    filter = filter.replace(';', ' ').replace('_', ' ')
+                    writer.write('{}\t{}'.format(vector, filter))
+                    writer.write('\n')
+
+    print("Output written to {}".format(single_filter_file))
+
+if __name__ == "__main__":
+    print("In main")
+    main()
+
+
+
\ No newline at end of file
diff --git a/scripts/datamanip/convert/PipeSeparatedFiltersToCommaSeparatedWithFilterName.py b/scripts/datamanip/convert/PipeSeparatedFiltersToCommaSeparatedWithFilterName.py
new file mode 100644
index 000000000..134951a4d
--- /dev/null
+++ b/scripts/datamanip/convert/PipeSeparatedFiltersToCommaSeparatedWithFilterName.py
@@ -0,0 +1,49 @@
+import sys
+
+def append_filter_name_to_filter_values(filter_name, filter_values_str):
+    filter_values = filter_values_str.split(',')
+    return [f"{filter_name}={filter_value}" for filter_value in filter_values]
+
+
+def main(input_file, output_file, is_query_file):
+    with open(output_file, mode="w", encoding='UTF-8') as writer:
+        with open(input_file, mode='r', encoding='UTF-8') as f:
+            count = 0
+            for line in f:
+                count += 1
+                line = line.strip()
+                if line == '':
+                    continue
+
+                all_filters_str = None
+                if is_query_file:
+                    parts = line.split('|')
+                    if len(parts) != 3:
+                        print(f"Invalid line: {line} at line number: {count}")
+                        continue
+                    all_filters_str = f"c1={parts[0]}&c2={parts[1]}&c3={parts[2]}"
+                else:
+                    parts = line.split('|')
+                    if len(parts) < 3:
+                        print(f"Invalid line: {line} at line number: {count}")
+                        continue
+                    all_filters = []
+                    all_filters.extend(append_filter_name_to_filter_values("c1", parts[0]))
+                    all_filters.extend(append_filter_name_to_filter_values("c2", parts[1]))
+                    all_filters.extend(append_filter_name_to_filter_values("c3", parts[2]))
+                    all_filters_str = ",".join(all_filters)
+
+                writer.write(all_filters_str)
+                writer.write('\n')
+
+                if count % 500000 == 0:
+                    print("Processed {} lines".format(count))
+
+    print("Output written to {}".format(output_file))
+
+if __name__ == "__main__":
+    if len(sys.argv) != 4:
+        print("Usage: PipeSeparatedFiltersToCommaSeparatedWithFilterName.py <input_file> <output_file> <is_query_file (true/false)>")
+        sys.exit(1)
+    else:
+        # bool("false") is True, so parse the flag explicitly.
+        main(sys.argv[1], sys.argv[2], sys.argv[3].strip().lower() in ("true", "1", "yes"))
\ No newline at end of file
diff --git a/scripts/datamanip/edgeclassifier/EdgeClassifier.py b/scripts/datamanip/edgeclassifier/EdgeClassifier.py
new file mode 100644
index 000000000..4e6609244
--- /dev/null
+++ b/scripts/datamanip/edgeclassifier/EdgeClassifier.py
@@ -0,0 +1,134 @@
+import sys
+import numpy as np
+import queue
+
+class DistanceStats:
+    def __init__(self):
+        self.close = []
+        self.far = []
+        self.intermediate = []
+        self.uncategorized = []
+        self.graphlevel = 0
+
+    def categorize(self, min_dist, max_dist):
+        # Split the observed distance range [min_dist, max_dist] into three equal buckets.
+        one_third = (max_dist - min_dist) / 3.0
+        two_third = 2 * one_third
+        for edge in self.uncategorized:
+            if edge[1] < min_dist + one_third:
+                self.close.append(edge)
+            elif edge[1] < min_dist + two_third:
+                self.intermediate.append(edge)
+            else:
+                self.far.append(edge)
+        self.uncategorized = []
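+
+# Worked example of the bucketing above (hypothetical numbers): if the distances seen during the
+# traversal span min_dist=1.0 to max_dist=10.0, then edges shorter than 4.0 are "close", edges in
+# [4.0, 7.0) are "intermediate", and the rest are "far".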
+
+
+data = {}   # id -> (vector, norm) map
+graph = {}  # id -> [id1, id2, ...] adjacency map
+
+# a and b are tuples of the form (vector, norm)
+def cosine(a, b):
+    return np.dot(a[0], b[0]) / (a[1] * b[1])
+
+def l2_distance(a, b):
+    return np.linalg.norm(a[0] - b[0])
+
+
+def compute_distances(data, graph, start_node, dist_metric):
+    navigation_queue = queue.Queue()
+    navigation_queue.put((start_node, 0))
+    classified_edges = {}
+    processed_nodes = set()
+
+    max_dist = float('-inf')
+    min_dist = float('inf')
+    while not navigation_queue.empty():
+        current_node_and_level = navigation_queue.get()
+        current_node = current_node_and_level[0]
+        current_level = current_node_and_level[1]
+
+        if current_node in processed_nodes:
+            continue
+
+        classified_edges[current_node] = DistanceStats()
+        classified_edges[current_node].graphlevel = current_level
+
+        for edge_node in graph[current_node]:
+            dist = dist_metric(data[current_node], data[edge_node])
+            classified_edges[current_node].uncategorized.append((edge_node, dist))
+            if dist > max_dist:
+                max_dist = dist
+            if dist < min_dist:
+                min_dist = dist
+
+            # Because we know the graph has cycles, it is possible that we have already processed the edge node
+            if edge_node not in processed_nodes:
+                navigation_queue.put((edge_node, current_level + 1))
+
+        processed_nodes.add(current_node)
+
+    return classified_edges, max_dist, min_dist
+
+
+def main(input_file, start_node, dist_metric_str, out_stats_prefix):
+    # read tsv file with one row per node: id \t [vector] \t [neighbor ids]
+    firstline = True
+    with open(input_file, 'r') as f:
+        for line in f:
+            if firstline:
+                firstline = False
+                continue
+            line = line.strip().split('\t')
+            id = int(line[0].strip())
+            vector = np.array([float(x) for x in line[1].strip('[').strip(']').split(',')])
+            data[id] = (vector, np.linalg.norm(vector))
+            graph[id] = np.array([int(x) for x in line[2].strip('[').strip(']').split(',')])
+
+    assert(len(data) == len(graph))
+    print("Read {} rows from {}. Data dim: {}".format(len(data), input_file, len(next(iter(data.values()))[0])))
+
+    dist_metric = None
+    if dist_metric_str == 'l2':
+        dist_metric = l2_distance
+    elif dist_metric_str == 'cosine':
+        dist_metric = cosine
+    else:
+        raise ValueError("Invalid distance metric. Use l2 or cosine")
+
+    classified_edges, max_dist, min_dist = compute_distances(data, graph, start_node, dist_metric)
+    print("Max distance: {}, Min distance: {}".format(max_dist, min_dist))
+
+    for node in classified_edges:
+        classified_edges[node].categorize(min_dist, max_dist)
+
+    out_stats_file = out_stats_prefix + '_stats.tsv'
+    with open(out_stats_file, 'w') as f:
+        for node in classified_edges:
+            f.write(str(node) + '\t' + str(classified_edges[node].graphlevel) +
+                    '\t' + str(len(classified_edges[node].close)) +
+                    '\t' + str(len(classified_edges[node].intermediate)) +
+                    '\t' + str(len(classified_edges[node].far)) + '\n')
+
+    # out_edges_file = out_stats_prefix + '_edges.tsv'
+    # with open(out_edges_file, 'w') as f:
+    #     for node in classified_edges:
+    #         for edge in classified_edges[node].close:
+    #             f.write(str(node) + '\t' + str(edge[0]) + '\t' + str(edge[1]) + '\t' + 'close' + '\n')
+    #         for edge in classified_edges[node].intermediate:
+    #             f.write(str(node) + '\t' + str(edge[0]) + '\t' + str(edge[1]) + '\t' + 'intermediate' + '\n')
+    #         for edge in classified_edges[node].far:
+    #             f.write(str(node) + '\t' + str(edge[0]) + '\t' + str(edge[1]) + '\t' + 'far' + '\n')
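+
+# Illustrative input row and invocation (hypothetical file names). Each data row is expected to be
+#   <node_id>\t[0.1,0.2,...]\t[3,17,42]
+# preceded by a single header row, which is skipped. Example:
+#   python EdgeClassifier.py graph_dump.tsv 0 l2 edge_stats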
Use l2 or cosine") + sys.exit(1) + + + main(sys.argv[1], int(sys.argv[2].strip()), sys.argv[3], sys.argv[4]) \ No newline at end of file diff --git a/scripts/datamanip/wendiresultmapper/FileConverter.py b/scripts/datamanip/wendiresultmapper/FileConverter.py new file mode 100644 index 000000000..b17a9f539 --- /dev/null +++ b/scripts/datamanip/wendiresultmapper/FileConverter.py @@ -0,0 +1,83 @@ +#Important: +# We get both the data and query files from wendi in the format: +# hash \t vector \t filter for data and +# query id \t vector \t filter for query +# For the data file, we need to collate the vectors and filters +# based on the hash. So we end up with one vector and several +# filters, all associated with the same hash (i.e., point). +# For the query, we want to get the query id, vector, and filter +# as-is, because we want to do single-filter search. +import sys +vector_map = {} + +class VectorAndFilter: + def __init__(self, vector, filter): + self.vector = vector + self.filters = [] + self.filters.append(filter) + + def add_filter(self, filter): + self.filters.append(filter) + + +def main(input_file, output_file_prefix, is_query_file_str): + is_query_file = bool(is_query_file_str) + + with open(input_file, 'r', encoding='utf-8') as f: + count = 0 + line = f.readline().strip() + print(line) + while line: + pieces = line.split('\t') + assert(len(pieces) == 3) + hash = pieces[0].strip() + vector = pieces[1].strip() + filter = pieces[2].strip() + if hash not in vector_map: + vector_map[hash] = VectorAndFilter(vector, filter) + else: + vector_map[hash].add_filter(filter) + line = f.readline().strip() + if count % 100000 == 0 and count > 0: + print("Processed " + str(count) + " lines") + count += 1 + + print("Found " + str(len(vector_map)) + " unique vectors") + + vec_file = open(output_file_prefix + "_vectors.tsv", 'w', encoding='utf-8') + filter_file = open(output_file_prefix + "_filters.tsv", 'w', encoding='utf-8') + hash_vecid_map_file = open(output_file_prefix + "_hash_vecid_map.tsv", 'w', encoding='utf-8') + + if is_query_file : + count = 0 + for (vector_hash, vector_and_filter) in vector_map.items(): + for filter in vector_and_filter.filters: + hash_vecid_map_file.write(vector_hash +"\t" + str(count)) + hash_vecid_map_file.write("\n") + vec_file.write(vector_and_filter.vector.replace("|", "\t")) + vec_file.write("\n") + filter_file.write(filter) + filter_file.write("\n") + count += 1 + else: + count = 0 + for (vector_hash, vector_and_filter) in vector_map.items(): + hash_vecid_map_file.write(vector_hash +"\t" + str(count)) + hash_vecid_map_file.write("\n") + vec_file.write(vector_and_filter.vector.replace("|", "\t")) + vec_file.write("\n") + filter_file.write(",".join(vector_and_filter.filters)) + filter_file.write("\n") + + count += 1 + + vec_file.close() + filter_file.close() + hash_vecid_map_file.close() + print("Wrote " + str(count) + " vectors and metadata to " + ("query files" if is_query_file else "data files") + " with prefix: " + output_file_prefix) + +if __name__ == "__main__": + if len(sys.argv) != 4: + print("Usage: python FileConverter.py ") + sys.exit(1) + main(sys.argv[1], sys.argv[2], sys.argv[3]) \ No newline at end of file diff --git a/scripts/datamanip/wendiresultmapper/ResultMapper.py b/scripts/datamanip/wendiresultmapper/ResultMapper.py new file mode 100644 index 000000000..dd5b189a8 --- /dev/null +++ b/scripts/datamanip/wendiresultmapper/ResultMapper.py @@ -0,0 +1,171 @@ +import sys +import pandas as pd +import linecache + +def print_one(table_name, df): + 
print(f"Table: {table_name}") + header = pd.DataFrame([df.columns], columns=df.columns) # Create a row with headers + first_row = df.iloc[[0]] # Get the first row + result = pd.concat([header, first_row]) + print(result.to_string(index=False)) + print("\n\n") + +def main(): + # origquery_file = "E:\\data\\FromWendi\\Q_vector_and_filter_20240724.tsv" + # query_pipe_sep_file = "E:\\data\\FromWendi\\only_query_pipe_sep.tsv" + # query_sf_file = "E:\\data\\FromWendi\\query_filters.sf.txt" + # results_file = "D:\\bin\\github\\wendi-fil-l2_L3_results.tsv" + #docs_master_file = "E:\\data\\FromWendi\\D_vector_and_filter_20240724.tsv" + #results_with_doc_ids_file = "E:\\data\\FromWendi\\result_list_with_doc_ids.tsv" + + qhash_id_map_file = "E:\\data\\FromWendi\\new\\query_hash_vecid_map.tsv" + query_sf_filters_file = "E:\\data\\FromWendi\\new\\query_filters.tsv" + dhash_docid_map_file = "E:\\data\\FromWendi\\new\\doc_hash_vecid_map.tsv" + query_results_raw_file = "D:\\bin\\github\\new-mainbranch-wendi-rslts_L0_results.tsv" + + final_results_file = "E:\\data\\FromWendi\\new\\final_results_file.tsv" + + + cn1 = ['query_hash', 'query_vec_id'] + qhash_id_map_df = pd.read_csv(qhash_id_map_file, sep='\t', header=None, names=cn1, encoding='utf-8') + print_one("QueryHash To QueryId", qhash_id_map_df) + + cn2 = ['filter'] + query_sf_filters_df = pd.read_csv(query_sf_filters_file, sep='\t', header=None, names=cn2, encoding='utf-8') + query_sf_filters_df.reset_index(drop=False, inplace=True) + query_sf_filters_df.rename(columns={'index': 'query_vec_id'}, inplace=True) + print_one("Query Filters", query_sf_filters_df) + + cn3 = ['doc_hash', 'doc_vec_id'] + dhash_docid_map_df = pd.read_csv(dhash_docid_map_file, sep='\t', header=None, names=cn3, encoding='utf-8') + print_one("DocHash To DocId", dhash_docid_map_df) + + # cn4 = ['query_vec_id', 'results'] + # query_results_raw_df = pd.read_csv(query_results_raw_file, sep='\t', header=None, names=cn4, encoding='utf-8') + # print_one("Query Results Raw", query_results_raw_df) + + # cn5 = ['query_vec_id', 'doc_vec_id', 'score', 'match_type'] + # processed_results_df = pd.DataFrame(columns=cn5) + + # for index, row in query_results_raw_df.iterrows(): + # result_str = row['results'] + # detailed_result_list = result_str.split('),') + # if index % 1000 == 0: + # print("Processing row: {}".format(index)) + # detailed_result_rows = {'query_vec_id': [], 'doc_vec_id': [], 'score': [], 'match_type': []} + # for detailed_result in detailed_result_list: + # detailed_result = detailed_result.strip('(').strip(')').strip() + # if detailed_result == '': + # continue + # result_id_score_match = detailed_result.split(',') + # detailed_result_rows['query_vec_id'].append(row['query_vec_id']) + # detailed_result_rows['doc_vec_id'].append(result_id_score_match[0]) + # detailed_result_rows['score'].append(result_id_score_match[1]) + # detailed_result_rows['match_type'].append(result_id_score_match[2]) + # processed_results_df = pd.concat([processed_results_df, pd.DataFrame(detailed_result_rows)], ignore_index=True) + # print_one("Processed Results", processed_results_df) + + # processed_results_df.to_csv("E:\\data\\FromWendi\\new\\results_with_query_and_docids.tsv", sep='\t', index=False) + + #Do the final merge between processed_results_df and dhash_docid_map_df + cn5 = ['query_vec_id', 'doc_vec_id', 'score', 'match_type'] + processed_results_df = pd.read_csv("E:\\data\\FromWendi\\new\\results_with_query_and_docids.tsv", names=cn5, sep='\t', encoding='utf-8') + print_one("Processed 
Results", processed_results_df) + + processed_results_with_filters = pd.merge(processed_results_df, query_sf_filters_df, on = 'query_vec_id', how='inner') + print_one("Results With Filters", processed_results_with_filters) + + results_with_query_hash = pd.merge(processed_results_with_filters, qhash_id_map_df, on = 'query_vec_id', how='inner') + final_results = pd.merge(results_with_query_hash, dhash_docid_map_df, on = 'doc_vec_id', how='inner') + final_results.to_csv("E:\\data\\FromWendi\\new\\final_results.tsv", sep='\t', index=False) + + + + + # # qhash, qid, filter + # qhash_qid_filter_df = pd.merge(qhash_id_map_df, query_sf_filters_df, on = 'query_vec_id', how='inner') + # # qhash, qid, filter, docid, score, matchtype + # qhash_qid_filter_docid_score_match_df = pd.merge(qhash_qid_filter_df, processed_results_df, on = 'query_vec_id', how='inner') + # # qhash, qid, filter, docid, score, matchtype, dochash + # qhash_qid_filter_docid_score_match_dochash_df = pd.merge(qhash_qid_filter_docid_score_match_df, dhash_docid_map_df, on = 'doc_vec_id', how='inner') + # qhash_qid_filter_docid_score_match_dochash_df.to_csv(final_results_file, sep='\t', index=False) + + + + + + + + + + + # cn1 = ['incoming_query_id', 'query_vec', 'labels'] + # origquery_df = pd.read_csv(origquery_file, sep='\t', header=None, names=cn1, encoding='utf-8') + + # cn2=["query_vec"] + # query_pipe_sep_df = pd.read_csv(query_pipe_sep_file, sep='\t', header=None, names=cn2, encoding='utf-8') + # query_pipe_sep_df.reset_index(drop=False, inplace=True) + # query_pipe_sep_df.rename(columns={'index': 'query_id'}, inplace=True) + + # cn3=["label"] + # query_sf_df = pd.read_csv(query_sf_file, sep='\t', header=None, names=cn3, encoding='utf-8') + # query_sf_df.reset_index(drop=False, inplace=True) + # query_sf_df.rename(columns={'index': 'query_id'}, inplace=True) + + # cn4=["query_id", "results"] + # results_df = pd.read_csv(results_file, sep='\t', header=None, names=cn4, encoding='utf-8') + # results_df.reset_index(drop=False) + + # #print column names of each dataframe + # print("Columns of ORIGINAL query file:{}".format(origquery_df.columns)) + # print("Columns of PROCESSED query file: {}".format(query_pipe_sep_df.columns)) + # print("Columns of QUERY FILTERS file: {}".format(query_sf_df.columns)) + # print("Columns of RESULTS file: {}".format(results_df.columns)) + + + # #merge the dataframes carefuly! 
+    # query_tsv_orig_query_joined = pd.merge(origquery_df, query_pipe_sep_df, on='query_vec', how='inner')
+    # incoming_q_id_query_id = query_tsv_orig_query_joined[['query_id', 'incoming_query_id']]
+    # incoming_q_id_query_id_with_labels = pd.merge(incoming_q_id_query_id, query_sf_df, on='query_id', how='inner')
+    # incoming_q_id_query_id_with_labels_results = pd.merge(incoming_q_id_query_id_with_labels, results_df, on='query_id', how='inner')
+
+    # print("Merged ORIGINAL query file WITH single filters file and results and obtained {} rows and these columns:{}".format(incoming_q_id_query_id_with_labels_results.shape[0], incoming_q_id_query_id_with_labels_results.columns))
+    # print("Now processing the results to get the doc_ids, scores, and match types.")
+
+    # final_result_list = pd.DataFrame(columns=['incoming_query_id', 'label', 'doc_id', 'score', 'match_type'])
+    # # loop through the dataframes
+    # for index, row in incoming_q_id_query_id_with_labels_results.iterrows():
+    #     result_str = row['results']
+    #     detailed_result_list = result_str.split('),')
+    #     print("Process row: {} with query_id:{} and label: {}".format(index, row['incoming_query_id'], row['label']))
+    #     detailed_result_rows = {'incoming_query_id': [], 'label': [], 'doc_id': [], 'score': [], 'match_type': []}
+    #     for detailed_result in detailed_result_list:
+    #         detailed_result = detailed_result.strip('(').strip(')').strip()
+    #         if detailed_result == '':
+    #             continue
+    #         result_id_score_match = detailed_result.split(',')
+    #         # new_record = pd.DataFrame([{'incoming_query_id': row['incoming_query_id'], 'label': row['label'], 'doc_id': result_id_score_match[0], 'score': result_id_score_match[1], 'match_type': result_id_score_match[2]}])
+    #         detailed_result_rows['incoming_query_id'].append(row['incoming_query_id'])
+    #         detailed_result_rows['label'].append(row['label'])
+    #         detailed_result_rows['doc_id'].append(result_id_score_match[0])
+    #         detailed_result_rows['score'].append(result_id_score_match[1])
+    #         detailed_result_rows['match_type'].append(result_id_score_match[2])
+    #     final_result_list = pd.concat([final_result_list, pd.DataFrame(detailed_result_rows)], ignore_index=True)
+
+    # final_result_list.to_csv(results_with_doc_ids_file, sep='\t', index=False)
+    # print("Obtained {} records after extracting results into separate rows. Saved to file {}.".format(final_result_list.shape[0], results_with_doc_ids_file))
+
+    # final_result_list = pd.read_csv(results_with_doc_ids_file, sep='\t', encoding='utf-8')
+
+    # print("Reading docs master from: {}".format(docs_master_file))
+    # docs_master = pd.read_csv(docs_master_file, sep='\t', usecols=[0], names=['doc_hash'], encoding='utf-8')
+    # docs_master.reset_index(drop=False, inplace=True)
+    # docs_master.rename(columns={'index': 'doc_id'}, inplace=True)
+    # print("Docs master has {} rows and these columns:{}".format(docs_master.shape[0], docs_master.columns))
+
+    # print("Merging final result set with docs_master with {} rows and these columns {}".format(docs_master.shape[0], docs_master.columns))
+    # final_result_with_doc_hashes = pd.merge(final_result_list, docs_master, on='doc_id', how='inner')
+    # final_result_with_doc_hashes.to_csv("E:\\data\\FromWendi\\results_with_doc_hashes.tsv", sep='\t', index=False)
+
+if __name__ == '__main__':
+    main()