Skip to content

Commit

Permalink
Add data cleaning scenario
Browse files Browse the repository at this point in the history
  • Loading branch information
xJoskiy committed Jan 5, 2025
1 parent 4b008fd commit 19defcb
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 0 deletions.
2 changes: 2 additions & 0 deletions examples/datasets/taxes_2.csv
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,5 @@ Texas,1000,0.15
Texas,2000,0.25
Texas,3000,0.3
Texas,5000,0.05
Texas,6000,0.04
Texas,7000,0.03
90 changes: 90 additions & 0 deletions examples/expert/data_cleaning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import desbordante as db
import networkx as nx
import matplotlib.pyplot as plt
import time


class DataCleaner:
    """Greedy cleaner working on a graph built from pairs of conflicting items.

    Every violation pair becomes an undirected edge. Cleaning repeatedly drops
    the node with the most neighbours until no edge (violation) is left.

    Attributes are plain containers so they can be inspected or printed:
        graph: dict mapping each node to the list of its distinct neighbours.
        nodes: list of the nodes still present, in first-seen order.
    """

    def __init__(self, violations):
        """Build the adjacency lists.

        Args:
            violations: iterable of 2-item pairs ``(v1, v2)`` of hashable
                values (presumably row indices reported by the DC verifier).
                Repeated and mirrored pairs are stored only once.
        """
        self.graph = {}
        for v1, v2 in violations:
            self.__add_neighbour(v1, v2)
            self.__add_neighbour(v2, v1)
        self.nodes = list(self.graph.keys())

    def __add_neighbour(self, node, neighbour):
        """Record ``neighbour`` as adjacent to ``node`` unless already known."""
        neighbours = self.graph.setdefault(node, [])
        # Keeping neighbours distinct guarantees that removing a node erases
        # every reference to it with a single list.remove() call.
        if neighbour not in neighbours:
            neighbours.append(neighbour)

    def __remove_most_common(self):
        """Delete the node with the most neighbours from the graph.

        Ties are resolved in favour of the node that was inserted first.

        Raises:
            ValueError: if the graph has no nodes.
        """
        if not self.graph:
            raise ValueError("Graph is empty")

        max_key = max(self.graph, key=lambda node: len(self.graph[node]))

        # pop() detaches the neighbour list first, so it is never mutated
        # while being iterated (a self-pair would otherwise do exactly that).
        for node in self.graph.pop(max_key):
            if node != max_key:
                self.graph[node].remove(max_key)
        self.nodes.remove(max_key)

    def __has_violations(self):
        """Return True while at least one node still has a neighbour."""
        return any(self.graph.values())

    def __is_connected(self):
        """Return True if every node is reachable from the first one.

        Not used by ``clean``; kept as a graph utility.

        Raises:
            ValueError: if the graph has no nodes.
        """
        if not self.graph:
            raise ValueError("Graph is empty")

        start = next(iter(self.graph))
        visited = {start}
        stack = [start]
        while stack:
            for neighbour in self.graph[stack.pop()]:
                if neighbour not in visited:
                    visited.add(neighbour)
                    stack.append(neighbour)

        return len(visited) == len(self.graph)

    def clean(self, violations=None):
        """Remove highest-degree nodes until no violation edge remains.

        The loop is driven by remaining edges rather than by connectivity, so
        it terminates on every input (including an empty graph) and also
        cleans graphs made of several separate components.

        Args:
            violations: unused; accepted only so existing callers that pass
                the violation list keep working.
        """
        while self.__has_violations():
            self.__remove_most_common()

    def draw(self):
        """Print the adjacency dict and show the graph in a matplotlib window."""
        G = nx.Graph()
        G.add_nodes_from(self.nodes)
        for node, neighbours in self.graph.items():
            for neighbour in neighbours:
                G.add_edge(node, neighbour)
        print(self.graph)
        nx.draw(G, with_labels=True)
        plt.show(block=True)


def verify_dc(verificator, dc):
    """Check one denial constraint and print the verdict with the elapsed time.

    Args:
        verificator: object providing ``execute(denial_constraint=...)`` and
            ``dc_holds()``.
        dc: the denial constraint as a string; it is passed to ``execute``
            and echoed in the report line.
    """
    print("Algo execution start")

    started_at = time.time()
    verificator.execute(denial_constraint=dc)

    print("Algo execution end")

    holds = verificator.dc_holds()
    # The measured interval covers execute(), the status print and dc_holds().
    elapsed = time.time() - started_at

    print("".join(("DC ", dc, " holds: ", str(holds))))
    print(f"Total time elapsed: {elapsed}")


def main():
    """Verify a denial constraint on the taxes dataset, then clean and plot.

    The conflict graph is drawn twice: once as built from the reported
    violations and once after ``DataCleaner.clean`` has run.
    """
    dataset_path = '../datasets/taxes_2.csv'
    constraint = "!(t.0 == s.0 and t.1 > s.1 and t.2 < s.2)"

    dc_verifier = db.dc_verification.algorithms.Default()
    # Tuple is (path, ',', True) -- presumably separator and has-header flag.
    dc_verifier.load_data(table=(dataset_path, ',', True))
    verify_dc(dc_verifier, constraint)

    found = dc_verifier.get_violations()
    cleaner = DataCleaner(found)

    # Show the graph before and after cleaning.
    cleaner.draw()
    cleaner.clean(found)
    cleaner.draw()

# Run the example only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

0 comments on commit 19defcb

Please sign in to comment.