-
Notifications
You must be signed in to change notification settings - Fork 0
/
db_explorer.py
186 lines (147 loc) · 6.89 KB
/
db_explorer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import pandas as pd
from tqdm import tqdm
class DBExplorer(object):
"""Class that takes a connection to a database and provides methods
to get a quick and dirty exploratory view of what the db is looking
like.
Args:
conn(pyobdc.connection): The connection to the database.
"""
def __init__(self, conn, indexing=True):
self.conn = conn
self.cursor = self.conn.cursor()
self.create_tname_lists()
# Check for table shapes.
if indexing:
try:
self.shape_df = pd.read_csv("./resources/generated/table_shapes.csv")
print("SUCCESS: Table shape files loaded!")
except FileNotFoundError:
print("HEADSUP: Database shape file not found. File will now be generated.")
print("HEADSUP: This operation may take some time, depending on the size of the db. Grab a coffee." )
self.shape_df = self.make_table_shape_table()
# Check for view shapes.
try:
self.views_df = pd.read_csv("./resources/generated/views_shapes.csv")
print("SUCCESS: Views Table shape files loaded!")
except FileNotFoundError:
print("HEADSUP: Database views shape file not found. File will now be generated.")
print("HEADSUP: This operation may take some time, depending on the size of the db. Grab another coffee." )
self.views_shape_df = self.make_table_shape_table(views=True)
def create_tname_lists(self):
"""Sets self.base_table_names and self.view_table_names.
These two lists act as an inventory of the different tables in
the db that this calss is connected to."""
print("Indexing table names...")
QUERY = """SELECT *
from information_schema.tables;
"""
self.cursor.execute(QUERY)
self.base_table_names = []
self.view_table_names = []
for table in self.cursor.fetchall():
if table[3] == "BASE TABLE":
self.base_table_names.append(table[2])
else:
self.view_table_names.append(table[2])
print("Table names indexed...")
def make_table_shape_table(self, views=False):
"""Creates a MetaData File Called: table_shapes.csv which
contains all the information about the number of fields
and the number of entries in the db.
Args:
optional(views): Perform this operation for the views in the db, rather than the tables.
Returns:
out(pd.DataFrame): The shape dataframe (is set as either self.shape_df or self.view_shape_df)
"""
if views:
tlist = self.view_table_names
out_fname = "table_shapes.csv"
else:
tlist = self.base_table_names
out_fname = "views_table_shapes.csv"
names = []
entries = []
headers = []
for tname in tqdm(tlist):
QUERY = f"""SELECT * from {tname}"""
df = pd.read_sql(QUERY, self.conn)
names.append(tname)
entries.append(df.shape[0])
headers.append(df.shape[1])
df_dict = {"names": names,
"entries": entries,
"headers": headers}
out = pd.DataFrame(df_dict)
out.to_csv(f"./resources/generated/{out_fname}")
print("HEADSUP: table_shapes.csv written to folder: './resources/generated/'.")
return out
def make_key_data_table(self):
"""Creates a MetaData File Called: key_info.csv which
contains all of the information about the fields in the db,
how often those fields appear (if in multiple tables),
and how many entries correspond to those keys.
NOTE: Only does this wich actual tables, not views.
"""
key_info_dict = {
"name": [],
"appears_in_multiple": [],
"appears_in_tables": [],
"appears_in_count": [],
"total_entires": [],
"dtype": []
}
for tname in tqdm(self.base_table_names):
QUERY = f"""SELECT * from {tname}"""
df = pd.read_sql(QUERY, self.conn)
for col in df.columns:
if col not in key_info_dict["name"]:
key_info_dict["name"].append(col)
key_info_dict["appears_in_multiple"].append(False)
key_info_dict["appears_in_tables"].append([tname])
key_info_dict["appears_in_count"].append(1)
key_info_dict["total_entires"].append(df.shape[0])
key_info_dict["dtype"].append(df.dtypes[col])
else:
indx = key_info_dict["name"].index(col)
key_info_dict["appears_in_multiple"][indx] = True
key_info_dict["appears_in_tables"][indx].append(tname)
key_info_dict["appears_in_count"][indx] += 1
key_info_dict["total_entires"][indx] += df.shape[0]
out = pd.DataFrame(key_info_dict)
out.to_csv("./resources/generated/key_info.csv")
print("HEADSUP: key_info.csv written to folder: './resources/generated/'.")
return out
def read_table_as_dataframe(self, tname, verbose=True):
"""Reads a table in the database in as a pandas DataFrame
Args:
tname(str): The name of the table to pull.
Returns:
df(pandas.DataFrame): A Dataframe object of the sql table.
"""
if verbose:
print(f"Pulling table: {tname}... ...")
if tname in self.base_table_names:
QUERY = f"""SELECT * from {tname}"""
df = pd.read_sql(QUERY, self.conn)
if verbose:
print(f"Table {tname} pulled!")
return df
if verbose:
print(f"Table name specificed: {tname} is not currently within the database.. ..")
# Return empty df if table unable to be pulled.
return pd.DataFrame()
def check_for_join_options(self, tname1, tname2):
"""Checks if two databases have overlapping keys and returns the keys that the
tables could be joined on.
Args:
tname1(str): The name of first table to check.
tname2(str): The name of the second table to check.
Returns:
join_keys(list): A list of keys to perform the join on (empty if there are none).
"""
df1 = self.read_table_as_dataframe(tname1)
df2 = self.read_table_as_dataframe(tname2)
if not df1.empty and not df2.empty:
return [elm for elm in df1.columns if elm in df2.columns]
print("One or more of the tables specified is not present in the db. Join keys could not be generated.")