diff --git a/hypernetx/classes/factory.py b/hypernetx/classes/factory.py
index e1694770..c3f99f52 100644
--- a/hypernetx/classes/factory.py
+++ b/hypernetx/classes/factory.py
@@ -1,29 +1,10 @@
 import pandas as pd
-import numpy as np
-from hypernetx.classes.helpers import dict_depth
-
-
-# In[ ]:
-# -------------------------------------------------------------------------------------------------
-# Individual factory methods for property stores
-# -------------------------------------------------------------------------------------------------
-
-
-def remove_property_store_duplicates(PS, default_uid_col_names, aggregation_methods={}):
-    agg_methods = {}
-    for col in PS.columns:
-        if col not in aggregation_methods:
-            agg_methods[col] = "first"
-        else:
-            agg_methods[col] = aggregation_methods[col]
-    return PS.groupby(level=default_uid_col_names).agg(agg_methods)
-
-
-### Alternate code for creating dataframe for PS
-import ast, json
 
 
 def mkdict(x):
+    # function to create a dictionary from object x if it is not already a dictionary.
+    import ast, json
+
     if isinstance(x, dict):
         return x
     else:
@@ -50,136 +31,61 @@ def create_df(
     misc_properties_col=None,
     aggregation_methods=None,
 ):
+
     if not isinstance(dfp, pd.DataFrame):
         raise TypeError("method requires a Pandas DataFrame")
     else:
-        # dfp = deepcopy(properties) ### not sure if this is wise
-
+        # check whether use_index is set. if it is, keep the existing index; if not, set an index based on the uid columns.
         if use_index == False:
+            # if uid cols are specified, make those columns the index columns.
             if uid_cols != None:
+                # chk checks whether the specified column is a string. if it is not, it is assumed to be an integer position and the corresponding column name is looked up.
                 chk = lambda c: c if isinstance(c, str) else dfp.columns[c]
+                # set the index using the column names in uid_cols resolved by chk.
                 dfp = dfp.set_index([chk(c) for c in uid_cols])
-            else:
+            else:  # if uid_cols are not specified, assume the first one or two columns (depending on level) are the index columns and set the index.
                 if level == 2:
-                    dfp = dfp.set_index(dfp.columns[0], dfp.columns[1])
+                    dfp = dfp.set_index([dfp.columns[0], dfp.columns[1]])
                 else:
-                    dfp = dfp.set_index(dfp.columns[0])
-
-        if (
-            misc_properties_col in dfp.columns
-            and misc_properties_col != "misc_properties"
-        ):
-            dfp = dfp.rename(columns={misc_properties_col: "misc_properties"})
+                    dfp = dfp.set_index([dfp.columns[0]])
+
+        # if the misc prop col is in the column names
+        if misc_properties_col in dfp.columns:
+            # rename the misc properties column to the default name if it differs.
+            if misc_properties_col != "misc_properties":
+                dfp = dfp.rename(columns={misc_properties_col: "misc_properties"})
+            # force misc properties to be a dictionary if it is not.
             dfp.misc_properties = dfp.misc_properties.map(mkdict)
-        else:
+        else:  # if the column is not specified, create the misc properties column of empty dictionaries.
             dfp["misc_properties"] = [{} for row in dfp.index]
 
+        # check if a weight property column name was specified.
         if weight_prop in dfp.columns:
+            # if it was specified and exists, rename it to the default weight name and fill NA weights with the default.
            dfp = dfp.rename(columns={weight_prop: "weight"})
            dfp = dfp.fillna({"weight": default_weight})
+        # if weight_prop is not None but is not one of the column names, check the misc properties for the weight.
         elif weight_prop is not None:
 
             def grabweight(cell):
+                # function to grab weights from the misc properties column.
                 if isinstance(cell, dict):
                     return cell.get(weight_prop, default_weight)
                 else:
                     return default_weight
 
+            # set the weight column to the weights grabbed from the misc properties dictionary (if any).
             dfp["weight"] = dfp["misc_properties"].map(grabweight)
 
+        # reorder columns into the standard order.
         cols = [c for c in dfp.columns if c not in ["weight", "misc_properties"]]
         dfp = dfp[["weight"] + cols + ["misc_properties"]]
+
+        # remove duplicate indices, keeping the first occurrence of each.
         dfp = dfp[~dfp.index.duplicated(keep="first")]
 
    return dfp
 
 
-# def create_df(properties, uid_cols, use_indices,
-#               default_uid_col_names, weight_prop_col,
-#               misc_prop_col, default_weight, aggregation_methods):
-
-#     #get length of dataframe once to be used throughout this function.
-#     length_of_dataframe = len(properties)
-
-#     #get column names if integer was provided instead
-#     if isinstance(weight_prop_col, int):
-#         weight_prop_col = properties.columns[weight_prop_col]
-#     if isinstance(misc_prop_col, int):
-#         misc_prop_col = properties.columns[misc_prop_col]
-
-#     #get list of all column names in properties dataframe
-#     column_names = list(properties.columns)
-
-
-#     # set weight column code:
-#     # default to use weight column if it exists before looking for default weight array or in misc properties column.
-#     if weight_prop_col in column_names:
-#         #do nothing since this is the format we expect by default.
-#         pass
-#     #check to see if an array of weights was provided to use for weights column
-#     elif not isinstance(default_weight, int) and not isinstance(default_weight, float):
-#         properties[weight_prop_col] = default_weight
-
-#     #check if the weight column name exists in the misc properties.
-#     elif misc_prop_col in column_names: #check if misc properties exists
-#         #check if weight_prop_col is a key in any of the misc properties dicitonary.
-#         if any(weight_prop_col in misc_dict for misc_dict in properties[misc_prop_col]):
-#             #create list of cell weights from misc properties dictionaries and use default value if not in keys
-#             weights_from_misc_dicts = []
-#             for misc_dict in properties[misc_prop_col]:
-#                 if weight_prop_col in misc_dict:
-#                     weights_from_misc_dicts.append(misc_dict[weight_prop_col])
-#                 else:
-#                     weights_from_misc_dicts.append(default_weight)
-#             properties[weight_prop_col] = weights_from_misc_dicts
-
-#     #if not provided anywhere then add in as default value
-#     else:
-#         properties[weight_prop_col] = [default_weight]*length_of_dataframe
-
-#     #rename the columns where needed
-#     #start by defining dictionary of column renaming with uid columns.
-#     if not use_indices: #include uid columns if they are not indices.
-#         col_rename_dict = {uid_cols[i]: default_uid_col_names[i] for i in range(len(uid_cols))} #renaming dictionary
-#     else:
-#         col_rename_dict = {}
-#     #add weight column renaming
-#     col_rename_dict[weight_prop_col] = 'weight'
-#     #set misc properties column if not already provided and if set then update renaming dictionary.
-#     if misc_prop_col not in column_names:
-#         properties['misc_properties'] = [{}]*length_of_dataframe
-#     else:
-#         col_rename_dict[misc_prop_col] = 'misc_properties'
-#     #rename the columns
-#     properties.rename(columns = col_rename_dict, inplace = True) #rename the columns
-
-
-#     #set index for dataframe using the default uid column names that are dependent on the level if indices flag not on.
-#     if not use_indices:
-#         properties = properties.set_index(default_uid_col_names)
-#     else: #otherwise just rename the incides to the default names.
-#         properties.index.names = default_uid_col_names
-
-
-#     #remove any NaN values or missing values in weight column
-#     properties['weight'].fillna(default_weight, inplace = True)
-
-
-#     # remove any duplicate indices and combine using aggregation methods (defaults to 'first' if none provided).
-#     properties = remove_property_store_duplicates(properties, default_uid_col_names, aggregation_methods = aggregation_methods)
-
-
-#     #reorder columns to have properties last
-#     # Get the column names and the specific column
-#     specific_col = 'misc_properties'
-#     # Create a new order for the columns
-#     updated_column_names = list(properties.columns)
-#     new_order = [col for col in updated_column_names if col != specific_col] + [specific_col]
-#     # Reorder the dataframe using reindex
-#     properties = properties.reindex(columns=new_order)
-
-#     return properties
-
-
 def dataframe_factory_method(
     DF,
     level,
@@ -235,43 +141,10 @@ def dataframe_factory_method(
        PS = None
 
    else:
-        if use_indices:
-            uid_cols = DF.index.names
-        else:
-            # uid column name setting if they are not provided
-            if (
-                uid_cols is None
-            ):  # if none are provided set to the names of the first or first two columns depending on level
-                if level == 0 or level == 1:
-                    uid_cols = [DF.columns[0]]
-                elif level == 2:
-                    uid_cols = [DF.columns[0], DF.columns[1]]
-
-            # get column names if integer was provided instead and create new uid_cols with string names.
-            uid_cols_to_str = []
-            for col in uid_cols:
-                if isinstance(col, int):
-                    uid_cols_to_str.append(DF.columns[col])
-                else:
-                    uid_cols_to_str.append(col)
-            uid_cols = uid_cols_to_str
-
-        # set default uid column name(s)
-        if level == 0 or level == 1:
-            default_uid_col_names = ["uid"]
-        elif level == 2:
-            default_uid_col_names = ["edges", "nodes"]
-
-        # PS = create_df(DF, uid_cols = uid_cols, use_indices = use_indices,
-        #                default_uid_col_names = default_uid_col_names,
-        #                weight_prop_col = weight_col,
-        #                misc_prop_col = misc_properties_col,
-        #                default_weight = default_weight,
-        #                aggregation_methods = aggregate_by)
-
        PS = create_df(
            DF,
            uid_cols=uid_cols,
+            level=level,
            use_index=use_indices,
            weight_prop=weight_col,
            misc_properties_col=misc_properties_col,
@@ -339,23 +212,27 @@ def dict_factory_method(
        DF = None
 
    # if the dictionary data provided is for the setsystem (incidence data)
    elif level == 2:
-
        # explode list of lists into incidence pairs as a pandas dataframe using pandas series explode.
        DF = pd.DataFrame(pd.Series(D).explode()).reset_index()
        # rename columns to correct column names for edges and nodes
        DF = DF.rename(columns=dict(zip(DF.columns, ["edges", "nodes"])))
 
-        # if attributes are stored on the dictionary (ie, it has a depth greater than 2)
-        if dict_depth(D) > 2:
-            attribute_data = []
-            for _, incidence_pair in DF.iterrows():
-                edge, node = incidence_pair
+        attribute_data = {weight_col: [], misc_properties_col: []}
+        for _, incidence_pair in DF.iterrows():
+            edge, node = incidence_pair
+            if isinstance(D[edge], dict):
                attributes_of_incidence_pair = D[edge][node]
-                attribute_data.append(attributes_of_incidence_pair)
-            attribute_df = pd.DataFrame(attribute_data)
-            DF = pd.concat([DF, attribute_df], axis=1)
+                if weight_col in attributes_of_incidence_pair:
+                    weight_val = attributes_of_incidence_pair.pop(weight_col)
+                    attribute_data[weight_col] += [weight_val]
+                else:
+                    attribute_data[weight_col] += [default_weight]
+                attribute_data[misc_properties_col] += [attributes_of_incidence_pair]
+        attribute_df = pd.DataFrame(attribute_data)
+        DF = pd.concat([DF, attribute_df], axis=1)
 
-    else:
+    # if the dictionary data provided is for edges or nodes.
+    elif level == 1 or level == 0:
        attribute_data = []
        for key in D:
            attributes_of_key = D[key]
@@ -452,163 +329,3 @@ def list_factory_method(
    )
 
    return PS
-
-
-"""
-# In[ ]: testing code
-# Only runs if running from this file (This will show basic examples and testing of the code)
-
-
-if __name__ == "__main__":
-
-    run_list_example = False
-    if run_list_example:
-
-        list_of_iterables = [[1, 1, 2], {1, 2}, {1, 2, 3}]
-        display(list_of_iterables)
-
-        IPS = list_factory_method(list_of_iterables, level = 2,
-                                  aggregate_by = {'weight': 'sum'})
-        display(IPS)
-        print('-'*100)
-
-
-
-    run_simple_dict_example = True
-    if run_simple_dict_example:
-
-        cell_dict = {'e1':[1,2],'e2':[1,2],'e3':[1,2,3]}
-
-        print('Provided Dataframes')
-        print('-'*100)
-        display(cell_dict)
-
-        print('\n \nRestructured Dataframes using single factory method for property store repeated')
-        print('-'*100)
-
-        IPS = dict_factory_method(cell_dict, level = 2)
-
-        display(IPS)
-        print('-'*100)
-
-
-    run_dict_example = True
-    if run_dict_example:
-
-        cell_prop_dict = {'e1':{ 1: {'w':0.5, 'name': 'related_to'},
-                                 2: {'w':0.1, 'name': 'related_to','startdate': '05.13.2020'}},
-                          'e2':{ 1: {'w':0.52, 'name': 'owned_by'},
-                                 2: {'w':0.2}},
-                          'e3':{ 1: {'w':0.5, 'name': 'related_to'},
-                                 2: {'w':0.2, 'name': 'owner_of'},
-                                 3: {'w':1, 'type': 'relationship'}}}
-
-        edge_prop_dict = {'e1': {'number': 1},
-                          'e2': {'number': 2},
-                          'e3': {'number': 3}}
-
-        print('Provided Dataframes')
-        print('-'*100)
-        display(cell_prop_dict)
-
-        print('\n \nRestructured Dataframes using single factory method for property store repeated')
-        print('-'*100)
-
-        IPS = dict_factory_method(cell_prop_dict, level = 2, weight_col = 'w')
-        display(IPS)
-
-
-        EPS = dict_factory_method(edge_prop_dict, level = 0)
-        display(EPS)
-
-
-        NPS = dict_factory_method(None, level = 1, weight_col = 'w')
-        display(NPS)
-        print('-'*100)
-
-
-    run_simple_dataframe_example = False
-    if run_simple_dataframe_example:
-
-        incidence_dataframe = pd.DataFrame({'e': ['a', 'a', 'a', 'b', 'c', 'c'], 'n': [1, 1, 2, 3, 2, 3],})
-
-
-        print('Provided Dataframes')
-        print('-'*100)
-        display(incidence_dataframe)
-
-
-
-        print('\n \nRestructured Dataframes using single factory method for property store repeated')
-        print('-'*100)
-
-
-
-        IPS = dataframe_factory_method(incidence_dataframe, level = 2,
-                                       uid_cols = ['e', 'n'],
-                                       aggregate_by = {'weight': 'sum'},)
-        IS = IPS.index
-
-        display(IS)
-        display(IPS)
-
-        EPS = dataframe_factory_method(None, level = 0)
-        display(EPS)
-
-        NPS = dataframe_factory_method(None, level = 1, uid_cols = ['nodes'])
-        display(NPS)
-        print('-'*100)
-
-
-    run_dataframe_example = True
-    if run_dataframe_example:
-        print('')
-        print('='*100)
-        print('='*100)
-        print('='*100)
-        print('')
-
-        cell_prop_dataframe = pd.DataFrame({'E': ['a', 'a', 'a', 'b', 'c', 'c'], 'nodes': [1, 1, 2, 3, 2, 3],
-                                            'color': ['red', 'red', 'red', 'red', 'red', 'blue'],
-                                            'other_properties': [{}, {}, {'weight': 5}, {'time': 3}, {}, {}]})
-
-        edge_prop_dataframe = pd.DataFrame({'edges': ['a', 'b', 'c'],
-                                            'strength': [2, np.nan, 3]})
-
-        node_prop_dataframe = pd.DataFrame({'N': [1],
-                                            'temperature': [60]})
-        node_prop_dataframe.set_index(['N'], inplace = True)
-
-        print(list(node_prop_dataframe.columns))
-
-        print('Provided Dataframes')
-        print('-'*100)
-        display(cell_prop_dataframe)
-        display(edge_prop_dataframe)
-        display(node_prop_dataframe)
-
-        print('\n \nRestructured Dataframes using single factory method for property store repeated')
-        print('-'*100)
-
-
-        IPS = dataframe_factory_method(cell_prop_dataframe, level = 2,
-                                       uid_cols = ['E', 'nodes'],
-                                       misc_properties_col = 'other_properties',
-                                       aggregate_by = {'weight': 'sum'},)
-        IS = IPS.index
-
-        display(IS)
-
-        display(IPS)
-
-
-        EPS = dataframe_factory_method(edge_prop_dataframe, level = 0,
-                                       weight_col = 1, uid_cols = [0])
-        display(EPS)
-
-
-        NPS = dataframe_factory_method(node_prop_dataframe, level = 1,
-                                       use_indices = True)
-        display(NPS)
-        print('-'*100)
-"""
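A minimal usage sketch (not part of the patch) of the factory methods touched by this diff. It reuses the toy inputs from the removed __main__ testing block, replaces the notebook-style display() calls with print(), and assumes dict_factory_method and dataframe_factory_method remain importable from hypernetx.classes.factory.

import pandas as pd
from hypernetx.classes.factory import dataframe_factory_method, dict_factory_method

# nested dictionary input: edge -> node -> attribute dict, with 'w' as the weight key
cell_prop_dict = {
    "e1": {1: {"w": 0.5, "name": "related_to"}, 2: {"w": 0.1, "name": "related_to"}},
    "e2": {1: {"w": 0.52, "name": "owned_by"}, 2: {"w": 0.2}},
}
IPS = dict_factory_method(cell_prop_dict, level=2, weight_col="w")
print(IPS)  # incidence property store dataframe indexed by (edges, nodes)

# dataframe input: uid_cols name the edge and node columns for a level-2 (incidence) store
cell_prop_dataframe = pd.DataFrame(
    {"E": ["a", "a", "b"], "nodes": [1, 2, 3], "other_properties": [{}, {"weight": 5}, {}]}
)
IPS2 = dataframe_factory_method(
    cell_prop_dataframe, level=2, uid_cols=["E", "nodes"], misc_properties_col="other_properties"
)
print(IPS2)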