From d1da30dff874a08dce232067390f256074a37047 Mon Sep 17 00:00:00 2001 From: Alexandra Date: Mon, 11 Mar 2024 17:08:27 -0300 Subject: [PATCH 01/29] add ApiBuilder class --- api/api_class.ipynb | 0 api/data/tables.json | 573 +- api/data/tables_pre.json | 392 ++ api/data/tesseract_schema.json | 4910 +++++++++++++++++ api/src/utils/api_data_request/api.py | 164 + .../utils/api_data_request/api_generator.py | 187 +- api/src/utils/app.py | 18 +- api/src/utils/preprocessors/text.py | 41 +- .../utils/table_selection/table_details.py | 57 +- 9 files changed, 5985 insertions(+), 357 deletions(-) create mode 100644 api/api_class.ipynb create mode 100644 api/data/tables_pre.json create mode 100644 api/data/tesseract_schema.json create mode 100644 api/src/utils/api_data_request/api.py diff --git a/api/api_class.ipynb b/api/api_class.ipynb new file mode 100644 index 0000000..e69de29 diff --git a/api/data/tables.json b/api/data/tables.json index 2adf030..7c4a7d4 100644 --- a/api/data/tables.json +++ b/api/data/tables.json @@ -18,45 +18,76 @@ "description": "percentage change" } ], - "variables": [ - { - "name": "Year", - "description": "periodicity of the data", - "parent dimension": "Time", - "hierarchies": ["Year", "Month and Year"] - }, - { - "name": "Month and Year", - "description": "periodicity of the data with the format YYYYMM (example March of 2015 is 201503)", - "parent dimension": "Time", - "hierarchies": ["Year", "Month and Year"] - }, - { - "name": "Level 1.1", - "description": "most general level of products and services", - "parent dimension": "Products or Services", - "hierarchies": ["Level 1.1", "Level 2.2", "Level 3.3", "Level 4.4", "Level 5.5"] - }, - { - "name": "Level 2.2", - "parent dimension": "Products or Services", - "hierarchies": ["Level 1.1", "Level 2.2", "Level 3.3", "Level 4.4", "Level 5.5"] - }, - { - "name": "Level 3.3", - "parent dimension": "Products or Services", - "hierarchies": ["Level 1.1", "Level 2.2", "Level 3.3", "Level 4.4", "Level 5.5"] 
- }, - { - "name": "Level 4.4", - "parent dimension": "Products or Services", - "hierarchies": ["Level 1.1", "Level 2.2", "Level 3.3", "Level 4.4", "Level 5.5"] - }, - { - "name": "Level 5.5", - "description": "most detailed level of products and services", - "parent dimension": "Products or Services", - "hierarchies": ["Level 1.1", "Level 2.2", "Level 3.3", "Level 4.4", "Level 5.5"] + "dimensions": [ + { + "name": "Time", + "description": "Periodicity of the data (monthly or annual).", + "hierarchies": [ + { + "name": "Month and Year", + "description": "'Month and Year' has the format YYYYMM (example March of 2015 is 201503)", + "levels": [ + "Year", + "Month and Year" + ] + } + ] + }, + { + "name": "Products or Services", + "description": "contains levels of products and services", + "hierarchies": [ + { + "name": "Level 1.1", + "levels": [ + "Level 1.1", + "Level 2.2", + "Level 3.3", + "Level 4.4", + "Level 5.5" + ] + }, + { + "name": "Level 2.2", + "levels": [ + "Level 1.1", + "Level 2.2", + "Level 3.3", + "Level 4.4", + "Level 5.5" + ] + }, + { + "name": "Level 3.3", + "levels": [ + "Level 1.1", + "Level 2.2", + "Level 3.3", + "Level 4.4", + "Level 5.5" + ] + }, + { + "name": "Level 4.4", + "levels": [ + "Level 1.1", + "Level 2.2", + "Level 3.3", + "Level 4.4", + "Level 5.5" + ] + }, + { + "name": "Level 5.5", + "levels": [ + "Level 1.1", + "Level 2.2", + "Level 3.3", + "Level 4.4", + "Level 5.5" + ] + } + ] } ] }, @@ -67,55 +98,74 @@ "measures": [ { "name": "Millions Of Dollars", - "description": "value in millions of dollars of a certain shipment." + "description": "value in millions of dollars of a certain shipment." }, { "name": "Thousands Of Tons", "description": "weight in thousands of tons of a certain shipment." 
} ], - "variables": [ + "dimensions": [ { "name": "Year", - "description": "year", - "parent dimension": "Year", - "hierarchies": ["Year"] + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] }, { "name": "SCTG2", "description": "products based on SCTG classification (first level).", - "parent dimension": "SCTG2", - "hierarchies": ["SCTG2"] + "hierarchies": [ + { + "name": "SCTG2", + "levels": [ + "SCTG2" + ] + } + ] }, { "name": "Transportation Mode", "description": "mode of transportation or shipment.", - "parent dimension": "Transportation Mode", - "hierarchies": ["Transportation Mode"] - }, - { - "name": "Origin State", - "description": "Origin state", - "parent dimension": "Origin", - "hierarchies": ["Origin State", "Origin Region"] - }, - { - "name": "Origin Region", - "description": "Origin region", - "parent dimension": "Origin", - "hierarchies": ["Origin State", "Origin Region"] - }, - { - "name": "Destination State", - "description": "Destination state", - "parent dimension": "Destination", - "hierarchies": ["Destination State", "Destination Region"] - }, - { - "name": "Destination Region", - "description": "Destination region", - "parent dimension": "Destination", - "hierarchies": ["Destination State", "Destination Region"] + "hierarchies": [ + { + "name": "Transportation Mode", + "levels": [ + "Transportation Mode" + ] + } + ] + }, + { + "name": "Origin", + "description": "Origin region or state of the shipment", + "hierarchies": [ + { + "name": "Origin Region", + "levels": [ + "Origin State", + "Origin Region" + ] + } + ] + }, + { + "name": "Destination", + "description": "Destination region or state of the shipment", + "hierarchies": [ + { + "name": "Destination Region", + "levels": [ + "Destination State", + "Destination Region" + ] + } + ] } ] }, @@ -132,29 +182,42 @@ "name": "Default Rate" } ], - "variables": [ + "dimensions": [ { "name": "Year", - "parent dimension": "Year", - "hierarchies": ["Year"] - }, - { - "name": 
"State", - "description": "US states", - "parent dimension": "Geography", - "hierarchies": ["State", "County"] - }, - { - "name": "County", - "description": "US counties", - "parent dimension": "Geography", - "hierarchies": ["State", "County"] + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] + }, + { + "name": "Geography", + "description": "geographical dimension of the data, either state or county level.", + "hierarchies": [ + { + "name": "County", + "levels": [ + "State", + "County" + ] + } + ] }, { "name": "OPEID", "description": "School name according to the Office of Postsecondary Education Identifier", - "parent dimension": "OPEID", - "hierarchies": ["OPEID"] + "hierarchies": [ + { + "name": "OPEID", + "levels": [ + "OPEID" + ] + } + ] } ] }, @@ -176,86 +239,94 @@ "name": "Average Age" } ], - "variables": [ + "dimensions": [ { "name": "Year", - "parent dimension": "Year", - "hierarchies": ["Year"] - }, - { - "name": "Nation", - "description": "national level data (USA)", - "parent dimension": "Geography", - "hierarchies": ["Nation", "State", "PUMA"] - }, - { - "name": "State", - "description": "US states", - "parent dimension": "Geography", - "hierarchies": ["Nation", "State", "PUMA"] - }, - { - "name": "PUMA", - "description": "US PUMAs (public use microdata areas)", - "parent dimension": "Geography", - "hierarchies": ["Nation", "State", "PUMA"] + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] + }, + { + "name": "Geography", + "description": "geographical dimension of the data, either national, state or PUMA level.", + "hierarchies": [ + { + "name": "PUMA", + "levels": [ + "Nation", + "State", + "PUMA" + ] + } + ] }, { "name": "Age", - "parent dimension": "Age", - "hierarchies": ["Age"] + "hierarchies": [ + { + "name": "Age", + "levels": [ + "Age" + ] + } + ] }, { "name": "Gender", - "parent dimension": "Gender", - "hierarchies": ["Gender"] + "description": "Gender dimension (female or male).", + 
"hierarchies": [ + { + "name": "Gender", + "levels": [ + "Gender" + ] + } + ] }, { "name": "Race", - "parent dimension": "Race", - "hierarchies": ["Race"] - }, - { - "name": "Major Occupation Group", - "description": "PUMS Occupation most general classification", - "parent dimension": "PUMS Occupation", - "hierarchies": ["Major Occupation Group", "Minor Occupation Group", "Broad Occupation", "Detailed Occupation"] - }, - { - "name": "Minor Occupation Group", - "description": "PUMS minor Occupation classification", - "parent dimension": "PUMS Occupation", - "hierarchies": ["Major Occupation Group", "Minor Occupation Group", "Broad Occupation", "Detailed Occupation"] - }, - { - "name": "Broad Occupation", - "description": "PUMS broad Occupation classification", - "parent dimension": "PUMS Occupation", - "hierarchies": ["Major Occupation Group", "Minor Occupation Group", "Broad Occupation", "Detailed Occupation"] - }, - { - "name": "Detailed Occupation", - "description": "PUMS most detailed Occupation classification", - "parent dimension": "PUMS Occupation", - "hierarchies": ["Major Occupation Group", "Minor Occupation Group", "Broad Occupation", "Detailed Occupation"] - }, - { - "name": "Industry Sector", - "description": "PUMS industry sector classification (most general classification)", - "parent dimension": "PUMS Industry", - "hierarchies": ["Industry Sector", "Industry Sub-Sector", "Industry Group"] - }, - { - "name": "Industry Sub-Sector", - "description": "PUMS industry sub-sector classification", - "parent dimension": "PUMS Industry", - "hierarchies": ["Industry Sector", "Industry Sub-Sector", "Industry Group"] - }, - { - "name": "Industry Group", - "description": "PUMS industry group classification (most detailed classification)", - "parent dimension": "PUMS Industry", - "hierarchies": ["Industry Sector", "Industry Sub-Sector", "Industry Group"] + "hierarchies": [ + { + "name": "Race", + "levels": [ + "Race" + ] + } + ] + }, + { + "name": "PUMS Occupation", 
+ "description": "PUMS Occupation classification", + "hierarchies": [ + { + "name": "Detailed Occupation", + "levels": [ + "Major Occupation Group", + "Minor Occupation Group", + "Broad Occupation", + "Detailed Occupation" + ] + } + ] + }, + { + "name": "PUMS Industry", + "description": "PUMS industry classification", + "hierarchies": [ + { + "name": "Industry Group", + "levels": [ + "Industry Sector", + "Industry Sub-Sector", + "Industry Group" + ] + } + ] } ] }, @@ -273,28 +344,52 @@ "description": "Contains the total votes in a certain state for a certain year." } ], - "variables": [ + "dimensions": [ { "name": "Candidate", - "parent dimension": "Candidate", - "hierarchies": ["Candidate"] + "description": "Name of Senate candidates.", + "hierarchies": [ + { + "name": "Candidate", + "levels": [ + "Candidate" + ] + } + ] }, { "name": "State", - "description": "US states", - "parent dimension": "State", - "hierarchies": ["State"] + "hierarchies": [ + { + "name": "State", + "levels": [ + "State" + ] + } + ] }, { - "name": "Party", - "description": "Political party of the candidate", - "parent dimension": "Party", - "hierarchies": ["Party"] + "name": "Year", + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] }, { - "name": "Year", - "parent dimension": "Year", - "hierarchies": ["Year"] + "name": "Party", + "description": "Political party to which each candidate belongs to.", + "hierarchies": [ + { + "name": "Party", + "levels": [ + "Party" + ] + } + ] } ] }, @@ -312,34 +407,70 @@ "description": "Contains the total votes in a certain state for a certain year." 
} ], - "variables": [ + "dimensions": [ { "name": "Candidate", - "parent dimension": "Candidate", - "hierarchies": ["Candidate"] + "description": "Name of President candidates.", + "hierarchies": [ + { + "name": "Candidate", + "levels": [ + "Candidate" + ] + } + ] + }, + { + "name": "Geography", + "hierarchies": [ + { + "name": "State", + "levels": [ + "State" + ] + }, + { + "name": "County", + "levels": [ + "State County", + "County" + ] + } + ] }, { - "name": "State", - "description": "US states", - "parent dimension": "Geography", - "hierarchies": ["State"] + "name": "Candidate", + "hierarchies": [ + { + "name": "Candidate", + "levels": [ + "Candidate" + ] + } + ] }, { - "name": "County", - "description": "US counties", - "parent dimension": "Geography", - "hierarchies": ["State County", "County"] + "name": "Year", + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] }, { "name": "Party", - "description": "Political party of the candidate", - "parent dimension": "Party", - "hierarchies": ["Party"] - }, - { - "name": "Year", - "parent dimension": "Year", - "hierarchies": ["Year"] + "description": "Political party to which each candidate belongs to.", + "hierarchies": [ + { + "name": "Party", + "levels": [ + "Party" + ] + } + ] } ] }, @@ -357,34 +488,54 @@ "description": "Contains the total votes in a certain state for a certain year." 
} ], - "variables": [ + "dimensions": [ { "name": "Candidate", - "parent dimension": "Candidate", - "hierarchies": ["Candidate"] - }, - { - "name": "State", - "description": "US states", - "parent dimension": "Geography", - "hierarchies": ["State", "Congressional District"] + "description": "Name of House candidates.", + "hierarchies": [ + { + "name": "Candidate", + "levels": [ + "Candidate" + ] + } + ] + }, + { + "name": "Geography", + "description": "geographical dimension of the data, either state or congressional district level.", + "hierarchies": [ + { + "name": "Congressional District", + "levels": [ + "State", + "Congressional District" + ] + } + ] }, { - "name": "Congressional District", - "description": "US counties", - "parent dimension": "Geography", - "hierarchies": ["State", "Congressional District"] + "name": "Year", + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] }, { "name": "Party", - "description": "Political party of the candidate", - "parent dimension": "Party", - "hierarchies": ["Party"] - }, - { - "name": "Year", - "parent dimension": "Year", - "hierarchies": ["Year"] + "description": "Political party to which each candidate belongs to.", + "hierarchies": [ + { + "name": "Party", + "levels": [ + "Party" + ] + } + ] } ] } diff --git a/api/data/tables_pre.json b/api/data/tables_pre.json new file mode 100644 index 0000000..2adf030 --- /dev/null +++ b/api/data/tables_pre.json @@ -0,0 +1,392 @@ +{ + "tables": [ + { + "name": "Consumer Price Index - CPI", + "api": "Tesseract", + "description": "Table 'Consumer Price Index - CPI' has price index for products, encompassing over 200 expenditure categories grouped into major segments (food and beverages, housing, apparel, recreation, and other goods).", + "measures": [ + { + "name": "Consumer Price Index", + "description": "consumer price index" + }, + { + "name": "Standard Error", + "description": "standard error of CPI" + }, + { + "name": "Percent Change", + "description": 
"percentage change" + } + ], + "variables": [ + { + "name": "Year", + "description": "periodicity of the data", + "parent dimension": "Time", + "hierarchies": ["Year", "Month and Year"] + }, + { + "name": "Month and Year", + "description": "periodicity of the data with the format YYYYMM (example March of 2015 is 201503)", + "parent dimension": "Time", + "hierarchies": ["Year", "Month and Year"] + }, + { + "name": "Level 1.1", + "description": "most general level of products and services", + "parent dimension": "Products or Services", + "hierarchies": ["Level 1.1", "Level 2.2", "Level 3.3", "Level 4.4", "Level 5.5"] + }, + { + "name": "Level 2.2", + "parent dimension": "Products or Services", + "hierarchies": ["Level 1.1", "Level 2.2", "Level 3.3", "Level 4.4", "Level 5.5"] + }, + { + "name": "Level 3.3", + "parent dimension": "Products or Services", + "hierarchies": ["Level 1.1", "Level 2.2", "Level 3.3", "Level 4.4", "Level 5.5"] + }, + { + "name": "Level 4.4", + "parent dimension": "Products or Services", + "hierarchies": ["Level 1.1", "Level 2.2", "Level 3.3", "Level 4.4", "Level 5.5"] + }, + { + "name": "Level 5.5", + "description": "most detailed level of products and services", + "parent dimension": "Products or Services", + "hierarchies": ["Level 1.1", "Level 2.2", "Level 3.3", "Level 4.4", "Level 5.5"] + } + ] + }, + { + "name": "dot_faf", + "api": "Tesseract", + "description": "Table 'dot_faf' has freight movement among states and major metropolitan areas by all modes of transportation. Shows which goods are shipped from one region of the US to another region, according to type of commodity, mode of shipment, value, and weight.", + "measures": [ + { + "name": "Millions Of Dollars", + "description": "value in millions of dollars of a certain shipment." + }, + { + "name": "Thousands Of Tons", + "description": "weight in thousands of tons of a certain shipment." 
+ } + ], + "variables": [ + { + "name": "Year", + "description": "year", + "parent dimension": "Year", + "hierarchies": ["Year"] + }, + { + "name": "SCTG2", + "description": "products based on SCTG classification (first level).", + "parent dimension": "SCTG2", + "hierarchies": ["SCTG2"] + }, + { + "name": "Transportation Mode", + "description": "mode of transportation or shipment.", + "parent dimension": "Transportation Mode", + "hierarchies": ["Transportation Mode"] + }, + { + "name": "Origin State", + "description": "Origin state", + "parent dimension": "Origin", + "hierarchies": ["Origin State", "Origin Region"] + }, + { + "name": "Origin Region", + "description": "Origin region", + "parent dimension": "Origin", + "hierarchies": ["Origin State", "Origin Region"] + }, + { + "name": "Destination State", + "description": "Destination state", + "parent dimension": "Destination", + "hierarchies": ["Destination State", "Destination Region"] + }, + { + "name": "Destination Region", + "description": "Destination region", + "parent dimension": "Destination", + "hierarchies": ["Destination State", "Destination Region"] + } + ] + }, + { + "name": "ed_defaults", + "api": "Tesseract", + "description": "Table `ed_defaults` has cohort default rates of schools.", + "measures": [ + { + "name": "Borrowers in Default", + "description": "Number of borrowers in default" + }, + { + "name": "Default Rate" + } + ], + "variables": [ + { + "name": "Year", + "parent dimension": "Year", + "hierarchies": ["Year"] + }, + { + "name": "State", + "description": "US states", + "parent dimension": "Geography", + "hierarchies": ["State", "County"] + }, + { + "name": "County", + "description": "US counties", + "parent dimension": "Geography", + "hierarchies": ["State", "County"] + }, + { + "name": "OPEID", + "description": "School name according to the Office of Postsecondary Education Identifier", + "parent dimension": "OPEID", + "hierarchies": ["OPEID"] + } + ] + }, + { + "name": "pums_5", + 
"api": "Mondrian", + "description": "Table 'pums_5' has data on total population and average wages by Year, Nation, State or PUMA, age, gender, race, PUMS occupation and PUMS industry. You can query any combination of these.", + "measures": [ + { + "name": "Average Wage" + }, + { + "name": "Total Population" + }, + { + "name": "Average Income" + }, + { + "name": "Average Age" + } + ], + "variables": [ + { + "name": "Year", + "parent dimension": "Year", + "hierarchies": ["Year"] + }, + { + "name": "Nation", + "description": "national level data (USA)", + "parent dimension": "Geography", + "hierarchies": ["Nation", "State", "PUMA"] + }, + { + "name": "State", + "description": "US states", + "parent dimension": "Geography", + "hierarchies": ["Nation", "State", "PUMA"] + }, + { + "name": "PUMA", + "description": "US PUMAs (public use microdata areas)", + "parent dimension": "Geography", + "hierarchies": ["Nation", "State", "PUMA"] + }, + { + "name": "Age", + "parent dimension": "Age", + "hierarchies": ["Age"] + }, + { + "name": "Gender", + "parent dimension": "Gender", + "hierarchies": ["Gender"] + }, + { + "name": "Race", + "parent dimension": "Race", + "hierarchies": ["Race"] + }, + { + "name": "Major Occupation Group", + "description": "PUMS Occupation most general classification", + "parent dimension": "PUMS Occupation", + "hierarchies": ["Major Occupation Group", "Minor Occupation Group", "Broad Occupation", "Detailed Occupation"] + }, + { + "name": "Minor Occupation Group", + "description": "PUMS minor Occupation classification", + "parent dimension": "PUMS Occupation", + "hierarchies": ["Major Occupation Group", "Minor Occupation Group", "Broad Occupation", "Detailed Occupation"] + }, + { + "name": "Broad Occupation", + "description": "PUMS broad Occupation classification", + "parent dimension": "PUMS Occupation", + "hierarchies": ["Major Occupation Group", "Minor Occupation Group", "Broad Occupation", "Detailed Occupation"] + }, + { + "name": "Detailed 
Occupation", + "description": "PUMS most detailed Occupation classification", + "parent dimension": "PUMS Occupation", + "hierarchies": ["Major Occupation Group", "Minor Occupation Group", "Broad Occupation", "Detailed Occupation"] + }, + { + "name": "Industry Sector", + "description": "PUMS industry sector classification (most general classification)", + "parent dimension": "PUMS Industry", + "hierarchies": ["Industry Sector", "Industry Sub-Sector", "Industry Group"] + }, + { + "name": "Industry Sub-Sector", + "description": "PUMS industry sub-sector classification", + "parent dimension": "PUMS Industry", + "hierarchies": ["Industry Sector", "Industry Sub-Sector", "Industry Group"] + }, + { + "name": "Industry Group", + "description": "PUMS industry group classification (most detailed classification)", + "parent dimension": "PUMS Industry", + "hierarchies": ["Industry Sector", "Industry Sub-Sector", "Industry Group"] + } + ] + }, + { + "name": "Data_USA_Senate_election", + "api": "Tesseract", + "description": "Table `Data_USA_Senate_election` has data on number of votes by senate candidate, party and state.", + "measures": [ + { + "name": "Candidate Votes", + "description": "Contains the total votes for a House candidate in a certain year." + }, + { + "name": "Total Votes", + "description": "Contains the total votes in a certain state for a certain year." 
+ } + ], + "variables": [ + { + "name": "Candidate", + "parent dimension": "Candidate", + "hierarchies": ["Candidate"] + }, + { + "name": "State", + "description": "US states", + "parent dimension": "State", + "hierarchies": ["State"] + }, + { + "name": "Party", + "description": "Political party of the candidate", + "parent dimension": "Party", + "hierarchies": ["Party"] + }, + { + "name": "Year", + "parent dimension": "Year", + "hierarchies": ["Year"] + } + ] + }, + { + "name": "Data_USA_President_election", + "api": "Tesseract", + "description": "Table `Data_USA_President_election` has data on number votes by presidential candidate, party and state.", + "measures": [ + { + "name": "Candidate Votes", + "description": "Contains the total votes for a presidential candidate in a certain year." + }, + { + "name": "Total Votes", + "description": "Contains the total votes in a certain state for a certain year." + } + ], + "variables": [ + { + "name": "Candidate", + "parent dimension": "Candidate", + "hierarchies": ["Candidate"] + }, + { + "name": "State", + "description": "US states", + "parent dimension": "Geography", + "hierarchies": ["State"] + }, + { + "name": "County", + "description": "US counties", + "parent dimension": "Geography", + "hierarchies": ["State County", "County"] + }, + { + "name": "Party", + "description": "Political party of the candidate", + "parent dimension": "Party", + "hierarchies": ["Party"] + }, + { + "name": "Year", + "parent dimension": "Year", + "hierarchies": ["Year"] + } + ] + }, + { + "name": "Data_USA_House_election", + "api": "Tesseract", + "description": "Table `Data_USA_House_election` has data on number votes by House candidate, party and state.", + "measures": [ + { + "name": "Candidate Votes", + "description": "Contains the total votes for a presidential candidate in a certain year." + }, + { + "name": "Total Votes", + "description": "Contains the total votes in a certain state for a certain year." 
+ } + ], + "variables": [ + { + "name": "Candidate", + "parent dimension": "Candidate", + "hierarchies": ["Candidate"] + }, + { + "name": "State", + "description": "US states", + "parent dimension": "Geography", + "hierarchies": ["State", "Congressional District"] + }, + { + "name": "Congressional District", + "description": "US counties", + "parent dimension": "Geography", + "hierarchies": ["State", "Congressional District"] + }, + { + "name": "Party", + "description": "Political party of the candidate", + "parent dimension": "Party", + "hierarchies": ["Party"] + }, + { + "name": "Year", + "parent dimension": "Year", + "hierarchies": ["Year"] + } + ] + } + ] +} \ No newline at end of file diff --git a/api/data/tesseract_schema.json b/api/data/tesseract_schema.json new file mode 100644 index 0000000..5ac49fb --- /dev/null +++ b/api/data/tesseract_schema.json @@ -0,0 +1,4910 @@ +{ + "name": "datausa", + "annotations": {}, + "cube_map": { + "Data_USA_House_Compact_election": { + "name": "Data_USA_House_Compact_election", + "table": { + "name": "election_house_compact", + "primary_key": "winning_candidate", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "hidden_dimensions": "Version", + "dataset_link": "https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/IG0UN2/8KAB8V", + "source_name": "Federal Election Commision", + "source_description": "By applying scientific principles to how elections are studied and administered, the MIT Election Lab aims to improve the democratic experience for all U.S. voters. The lab supports advances in election science by collecting, analyzing, and sharing core data and findings. 
They also aim to build relationships with election officials and others to help apply new scientific research to the practice of democracy in the United States.", + "source_link": "https://www.fec.gov/", + "dataset_name": "U.S House 1976-2020", + "topic": "Election", + "subtopic": "House Compact" + }, + "captions": {}, + "dimension_map": { + "Geography": { + "name": "Geography", + "default_hierarchy": "Geography", + "annotations": { + "dim_type": "GEOGRAPHY" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "district", + "hierarchy_map": { + "Geography": { + "name": "Geography", + "primary_key": "geoid", + "table": { + "name": "congressional_district", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "State": { + "name": "State", + "depth": 1, + "key_column": "state_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "state_name" + }, + "key_type": "str", + "property_map": {} + }, + "Congressional District": { + "name": "Congressional District", + "depth": 2, + "key_column": "geoid", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "name" + }, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "Year": { + "name": "Year", + "default_hierarchy": "Year", + "annotations": {}, + "captions": {}, + "dim_type": "time", + "foreign_key": "year", + "hierarchy_map": { + "Year": { + "name": "Year", + "primary_key": "year", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Year": { + "name": "Year", + "depth": 1, + "key_column": "year", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u16", + "property_map": {} + } + } + } + } + }, + "Candidate": { + "name": "Candidate", + "default_hierarchy": "Candidate", + "annotations": { + "dim_type": "CANDIDATE" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "winning_candidate", + 
"hierarchy_map": { + "Candidate": { + "name": "Candidate", + "primary_key": "winning_candidate", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Candidate": { + "name": "Candidate", + "depth": 1, + "key_column": "winning_candidate", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "Special": { + "name": "Special", + "default_hierarchy": "Special", + "annotations": { + "dim_type": "SPECIAL" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "special", + "hierarchy_map": { + "Special": { + "name": "Special", + "primary_key": "special", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Special": { + "name": "Special", + "depth": 1, + "key_column": "special", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u8", + "property_map": {} + } + } + } + } + }, + "Party": { + "name": "Party", + "default_hierarchy": "Party", + "annotations": { + "dim_type": "PARTY" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "party", + "hierarchy_map": { + "Party": { + "name": "Party", + "primary_key": "party", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Party": { + "name": "Party", + "depth": 1, + "key_column": "party", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "Runoff": { + "name": "Runoff", + "default_hierarchy": "Runoff", + "annotations": { + "dim_type": "RUNOFF" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "runoff", + "hierarchy_map": { + "Runoff": { + "name": "Runoff", + "primary_key": "runoff", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Runoff": { + "name": "Runoff", + "depth": 1, + "key_column": "runoff", + 
"annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "str", + "property_map": {} + } + } + } + } + } + }, + "measure_map": { + "Winner Votes": { + "name": "Winner Votes", + "key_column": "winner_votes", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "units_ofmeasurement": "VOTES" + }, + "captions": {}, + "submeasures": {} + }, + "Other Votes": { + "name": "Other Votes", + "key_column": "other_votes", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "units_ofmeasurement": "VOTES" + }, + "captions": {}, + "submeasures": {} + }, + "Total Votes": { + "name": "Total Votes", + "key_column": "total_votes", + "aggregator": { + "type": "Max" + }, + "annotations": { + "units_ofmeasurement": "VOTES" + }, + "captions": {}, + "submeasures": {} + } + }, + "subset_table": false, + "visible": true + }, + "Data_USA_House_election": { + "name": "Data_USA_House_election", + "table": { + "name": "election_house", + "primary_key": "candidate_id", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "hidden_dimensions": "Unofficial, Version", + "dataset_link": "https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/IG0UN2/8KAB8V", + "source_name": "Federal Election Commision", + "source_link": "https://www.fec.gov/", + "dataset_name": "U.S House 1976-2020", + "available_dimensions": "Candidate, Geography, Party, Year, Candidate Other, Special", + "hide_in_ui": "Unofficial, Version", + "topic": "Election", + "subtopic": "House" + }, + "captions": {}, + "dimension_map": { + "Geography": { + "name": "Geography", + "default_hierarchy": "Geography", + "annotations": { + "dim_type": "GEOGRAPHY" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "geo_id", + "hierarchy_map": { + "Geography": { + "name": "Geography", + "primary_key": "geoid", + "table": { + "name": "congressional_district", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + 
"captions": {}, + "default_member": null, + "level_map": { + "State": { + "name": "State", + "depth": 1, + "key_column": "state_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "state_name" + }, + "key_type": "str", + "property_map": {} + }, + "Congressional District": { + "name": "Congressional District", + "depth": 2, + "key_column": "geoid", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "name" + }, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "Candidate": { + "name": "Candidate", + "default_hierarchy": "Candidate", + "annotations": { + "dim_type": "CANDIDATE" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "candidate_id", + "hierarchy_map": { + "Candidate": { + "name": "Candidate", + "primary_key": "candidate_id", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Candidate": { + "name": "Candidate", + "depth": 1, + "key_column": "candidate_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "candidate" + }, + "key_type": "str", + "property_map": { + "Office": { + "name": "Office", + "annotations": {}, + "captions": {}, + "key_column_map": { + "xx": "office" + }, + "key_type": "i64" + } + } + } + } + } + } + }, + "Candidate Other": { + "name": "Candidate Other", + "default_hierarchy": "Candidate Other", + "annotations": { + "dim_type": "CANDIDATE OTHER" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "candidate_other", + "hierarchy_map": { + "Candidate Other": { + "name": "Candidate Other", + "primary_key": "candidate_other", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Candidate Other": { + "name": "Candidate Other", + "depth": 1, + "key_column": "candidate_other", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "Party": { + "name": "Party", + 
"default_hierarchy": "Party", + "annotations": { + "dim_type": "PARTY" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "party", + "hierarchy_map": { + "Party": { + "name": "Party", + "primary_key": "party", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Party": { + "name": "Party", + "depth": 1, + "key_column": "party", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "Special": { + "name": "Special", + "default_hierarchy": "Special", + "annotations": { + "dim_type": "SPECIAL" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "special", + "hierarchy_map": { + "Special": { + "name": "Special", + "primary_key": "special", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Special": { + "name": "Special", + "depth": 1, + "key_column": "special", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "i64", + "property_map": {} + } + } + } + } + }, + "Runoff": { + "name": "Runoff", + "default_hierarchy": "Runoff", + "annotations": { + "dim_type": "RUNOFF" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "runoff", + "hierarchy_map": { + "Runoff": { + "name": "Runoff", + "primary_key": "runoff", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Runoff": { + "name": "Runoff", + "depth": 1, + "key_column": "runoff", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u8", + "property_map": {} + } + } + } + } + }, + "Unofficial": { + "name": "Unofficial", + "default_hierarchy": "Unofficial", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": "unofficial", + "hierarchy_map": { + "Unofficial": { + "name": "Unofficial", + "primary_key": "unofficial", + "table": null, + "annotations": {}, + "captions": {}, + 
"default_member": null, + "level_map": { + "Unofficial": { + "name": "Unofficial", + "depth": 1, + "key_column": "unofficial", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u8", + "property_map": {} + } + } + } + } + }, + "Year": { + "name": "Year", + "default_hierarchy": "Year", + "annotations": {}, + "captions": {}, + "dim_type": "time", + "foreign_key": "year", + "hierarchy_map": { + "Year": { + "name": "Year", + "primary_key": "year", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Year": { + "name": "Year", + "depth": 1, + "key_column": "year", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u16", + "property_map": {} + } + } + } + } + } + }, + "measure_map": { + "Candidate Votes": { + "name": "Candidate Votes", + "key_column": "candidatevotes", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "units_ofmeasurement": "VOTES" + }, + "captions": {}, + "submeasures": {} + }, + "Total Votes": { + "name": "Total Votes", + "key_column": "totalvotes", + "aggregator": { + "type": "Max" + }, + "annotations": { + "units_ofmeasurement": "VOTES" + }, + "captions": {}, + "submeasures": {} + } + }, + "subset_table": false, + "visible": true + }, + "Consumer Price Index - CPI": { + "name": "Consumer Price Index - CPI", + "table": { + "name": "cpi_data", + "primary_key": "id", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "dataset_link": "https://www.bls.gov/cpi/tables/supplemental-files/home.htm", + "source_name": "Bureau of Labor Statistics", + "source_description": "The Bureau of Labor Statistics (BLS) of the U.S. 
Department of Labor is the principal federal agency responsible for measuring labor market activity, working conditions, and price changes in the economy.", + "dataset_name": "Consumer Price Index (CPI)", + "topic": "Economy", + "subtopic": "Consumer Price" + }, + "captions": {}, + "dimension_map": { + "Time": { + "name": "Time", + "default_hierarchy": "Time", + "annotations": {}, + "captions": {}, + "dim_type": "time", + "foreign_key": "time_id", + "hierarchy_map": { + "Time": { + "name": "Time", + "primary_key": "time_id", + "table": { + "name": "dim_months_cpi", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Year": { + "name": "Year", + "depth": 1, + "key_column": "year", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u16", + "property_map": {} + }, + "Month and Year": { + "name": "Month and Year", + "depth": 2, + "key_column": "time_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "month_name" + }, + "key_type": "u32", + "property_map": {} + } + } + } + } + }, + "Product or Service": { + "name": "Product or Service", + "default_hierarchy": "Product Level 7", + "annotations": { + "dim_type": "Product or Service" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "product_id", + "hierarchy_map": { + "Product Level 1": { + "name": "Product Level 1", + "primary_key": "product_id", + "table": { + "name": "dim_cpi_level1", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Level 1.1": { + "name": "Level 1.1", + "depth": 1, + "key_column": "product_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "product_name" + }, + "key_type": "u64", + "property_map": {} + } + } + }, + "Product Level 2": { + "name": "Product Level 2", + "primary_key": "product_id", + "table": { + "name": "dim_cpi_level2", + "primary_key": 
"id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Level 2.1": { + "name": "Level 2.1", + "depth": 1, + "key_column": "level1_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level1_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 2.2": { + "name": "Level 2.2", + "depth": 2, + "key_column": "product_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "product_name" + }, + "key_type": "u64", + "property_map": {} + } + } + }, + "Product Level 3": { + "name": "Product Level 3", + "primary_key": "product_id", + "table": { + "name": "dim_cpi_level3", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Level 3.1": { + "name": "Level 3.1", + "depth": 1, + "key_column": "level1_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level1_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 3.2": { + "name": "Level 3.2", + "depth": 2, + "key_column": "level2_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level2_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 3.3": { + "name": "Level 3.3", + "depth": 3, + "key_column": "product_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "product_name" + }, + "key_type": "u64", + "property_map": {} + } + } + }, + "Product Level 4": { + "name": "Product Level 4", + "primary_key": "product_id", + "table": { + "name": "dim_cpi_level4", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Level 4.1": { + "name": "Level 4.1", + "depth": 1, + "key_column": "level1_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level1_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 4.2": { + "name": "Level 4.2", + "depth": 2, 
+ "key_column": "level2_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level2_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 4.3": { + "name": "Level 4.3", + "depth": 3, + "key_column": "level3_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level3_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 4.4": { + "name": "Level 4.4", + "depth": 4, + "key_column": "product_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "product_name" + }, + "key_type": "u64", + "property_map": {} + } + } + }, + "Product Level 5": { + "name": "Product Level 5", + "primary_key": "product_id", + "table": { + "name": "dim_cpi_level5", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Level 5.1": { + "name": "Level 5.1", + "depth": 1, + "key_column": "level1_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level1_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 5.2": { + "name": "Level 5.2", + "depth": 2, + "key_column": "level2_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level2_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 5.3": { + "name": "Level 5.3", + "depth": 3, + "key_column": "level3_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level3_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 5.4": { + "name": "Level 5.4", + "depth": 4, + "key_column": "level4_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level4_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 5.5": { + "name": "Level 5.5", + "depth": 5, + "key_column": "product_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "product_name" + }, + "key_type": "u64", + "property_map": {} + } + } + }, + "Product Level 6": { + 
"name": "Product Level 6", + "primary_key": "product_id", + "table": { + "name": "dim_cpi_level6", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Level 6.1": { + "name": "Level 6.1", + "depth": 1, + "key_column": "level1_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level1_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 6.2": { + "name": "Level 6.2", + "depth": 2, + "key_column": "level2_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level2_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 6.3": { + "name": "Level 6.3", + "depth": 3, + "key_column": "level3_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level3_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 6.4": { + "name": "Level 6.4", + "depth": 4, + "key_column": "level4_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level4_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 6.5": { + "name": "Level 6.5", + "depth": 5, + "key_column": "level5_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level5_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 6.6": { + "name": "Level 6.6", + "depth": 6, + "key_column": "product_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "product_name" + }, + "key_type": "u64", + "property_map": {} + } + } + }, + "Product Level 7": { + "name": "Product Level 7", + "primary_key": "product_id", + "table": { + "name": "dim_cpi_level7", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Level 7.1": { + "name": "Level 7.1", + "depth": 1, + "key_column": "level1_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level1_name" + }, + "key_type": "u64", + 
"property_map": {} + }, + "Level 7.2": { + "name": "Level 7.2", + "depth": 2, + "key_column": "level2_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level2_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 7.3": { + "name": "Level 7.3", + "depth": 3, + "key_column": "level3_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level3_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 7.4": { + "name": "Level 7.4", + "depth": 4, + "key_column": "level4_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level4_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 7.5": { + "name": "Level 7.5", + "depth": 5, + "key_column": "level5_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level5_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 7.6": { + "name": "Level 7.6", + "depth": 6, + "key_column": "level6_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "level6_name" + }, + "key_type": "u64", + "property_map": {} + }, + "Level 7.7": { + "name": "Level 7.7", + "depth": 7, + "key_column": "product_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "product_name" + }, + "key_type": "u64", + "property_map": {} + } + } + } + } + } + }, + "measure_map": { + "Consumer Price Index": { + "name": "Consumer Price Index", + "key_column": "unadjusted_percent_change", + "aggregator": { + "type": "Average" + }, + "annotations": {}, + "captions": {}, + "submeasures": {} + }, + "Standard Error": { + "name": "Standard Error", + "key_column": "standard_error", + "aggregator": { + "type": "Average" + }, + "annotations": {}, + "captions": {}, + "submeasures": {} + }, + "Percent Change": { + "name": "Percent Change", + "key_column": "percent_change", + "aggregator": { + "type": "Average" + }, + "annotations": {}, + "captions": {}, + "submeasures": {} + } + }, + "subset_table": 
true, + "visible": true + }, + "bls_growth_industry": { + "name": "bls_growth_industry", + "table": { + "name": "bls_growth_industry", + "primary_key": "id", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "dataset_link": "https://www.bls.gov/bls/industry.htm", + "source_name": "Bureau of Labor Statistics", + "source_description": "The Bureau of Labor Statistics (BLS) of the U.S. Department of Labor is the principal federal agency responsible for measuring labor market activity, working conditions, and price changes in the economy.", + "dataset_name": "BLS Statistics by Industry, Growth", + "topic": "Economy", + "subtopic": "Industry" + }, + "captions": {}, + "dimension_map": { + "Year": { + "name": "Year", + "default_hierarchy": "Year", + "annotations": { + "dim_type": "TIME" + }, + "captions": {}, + "dim_type": "time", + "foreign_key": "year", + "hierarchy_map": { + "Year": { + "name": "Year", + "primary_key": "year", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Year": { + "name": "Year", + "depth": 1, + "key_column": "year", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u16", + "property_map": {} + } + } + } + } + }, + "BLS Industry Flat": { + "name": "BLS Industry Flat", + "source": "BLS Industry Flat", + "foreign_key": "naics_code", + "annotations": {}, + "captions": {}, + "hierarchy_map": {} + } + }, + "measure_map": { + "Industry Jobs": { + "name": "Industry Jobs", + "key_column": "emp", + "aggregator": { + "type": "Average" + }, + "annotations": { + "aggregation_method": "NONE", + "units_of_measurement": "Jobs", + "pre_aggregation_method": "SUM" + }, + "captions": {}, + "submeasures": {} + }, + "Industry Jobs Change": { + "name": "Industry Jobs Change", + "key_column": "emp_change", + "aggregator": { + "type": "Average" + }, + "annotations": { + "aggregation_method": "NONE", + "units_of_measurement": "Jobs", + 
"pre_aggregation_method": "SUM" + }, + "captions": {}, + "submeasures": {} + }, + "Industry Jobs CARC": { + "name": "Industry Jobs CARC", + "key_column": "emp_carc", + "aggregator": { + "type": "Average" + }, + "annotations": { + "aggregation_method": "NONE", + "units_of_measurement": "Rate", + "pre_aggregation_method": "CARC", + "details": "Compound Annual Rate of Change" + }, + "captions": {}, + "submeasures": {} + }, + "Industry Output": { + "name": "Industry Output", + "key_column": "output_billions", + "aggregator": { + "type": "Average" + }, + "annotations": { + "aggregation_method": "NONE", + "units_of_measurement": "USD", + "pre_aggregation_method": "SUM", + "details": "Billions of Chained 2009 Dollars" + }, + "captions": {}, + "submeasures": {} + }, + "Industry Output CARC": { + "name": "Industry Output CARC", + "key_column": "output_carc", + "aggregator": { + "type": "Average" + }, + "annotations": { + "aggregation_method": "NONE", + "units_of_measurement": "Rate", + "pre_aggregation_method": "CARC", + "details": "Compound Annual Rate of Change" + }, + "captions": {}, + "submeasures": {} + } + }, + "subset_table": false, + "visible": true + }, + "onet_by_cip": { + "name": "onet_by_cip", + "table": { + "name": "onet_by_cip", + "primary_key": "id", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "dataset_link": "http://www.onetonline.org/", + "source_name": "O*NET Online", + "source_description": "The O*Net Skills is a dataset containing detailed descriptions of the required and used skills for specific occupations. 
The O*Net dataset is sponsored by the United States Department of Labor.", + "dataset_name": "O*NET by Classification of Instructional Programs", + "topic": "Education", + "subtopic": "Skills" + }, + "captions": {}, + "dimension_map": { + "CIP": { + "name": "CIP", + "source": "CIP", + "foreign_key": "cip_code", + "annotations": {}, + "captions": {}, + "hierarchy_map": {} + }, + "Skill Element": { + "name": "Skill Element", + "source": "Skill Element", + "foreign_key": "element_id", + "annotations": {}, + "captions": {}, + "hierarchy_map": {} + }, + "Year": { + "name": "Year", + "default_hierarchy": "Year", + "annotations": { + "dim_type": "TIME" + }, + "captions": {}, + "dim_type": "time", + "foreign_key": "year", + "hierarchy_map": { + "Year": { + "name": "Year", + "primary_key": "Year", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Year": { + "name": "Year", + "depth": 1, + "key_column": "year", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u16", + "property_map": {} + } + } + } + } + } + }, + "measure_map": { + "IM Value": { + "name": "IM Value", + "key_column": "im", + "aggregator": { + "type": "Average" + }, + "annotations": { + "aggregation_method": "NONE", + "units_of_measurement": "NONE", + "details": "Not used for presentation", + "hide_in_ui": "true" + }, + "captions": {}, + "submeasures": {} + }, + "LV Value": { + "name": "LV Value", + "key_column": "lv", + "aggregator": { + "type": "Average" + }, + "annotations": { + "aggregation_method": "NONE", + "units_of_measurement": "NONE", + "details": "Not used for presentation", + "hide_in_ui": "true" + }, + "captions": {}, + "submeasures": {} + }, + "Total Score": { + "name": "Total Score", + "key_column": "total_score", + "aggregator": { + "type": "Average" + }, + "annotations": { + "aggregation_method": "NONE", + "units_of_measurement": "NONE", + "details": "Calculated by IM Value * LV Value" + }, + "captions": {}, + 
"submeasures": {} + } + }, + "subset_table": false, + "visible": true + }, + "usa_spending": { + "name": "usa_spending", + "table": { + "name": "usa_spending", + "primary_key": "id", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "dataset_link": "https://www.usaspending.gov/", + "source_name": "USAspending.gov", + "source_description": "USA Spending provides a big-picture view of the federal spending landscape.", + "source_link": "https://www.usaspending.gov/", + "dataset_name": "Award Data Archive", + "topic": "Economy", + "subtopic": "Government Spending" + }, + "captions": {}, + "dimension_map": { + "Geography": { + "name": "Geography", + "default_hierarchy": "Nation", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": "fips_county", + "hierarchy_map": { + "Nation": { + "name": "Nation", + "primary_key": "geoid", + "table": { + "name": "us_nation", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Nation": { + "name": "Nation", + "depth": 1, + "key_column": "geoid", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "name" + }, + "key_type": "i64", + "property_map": {} + } + } + }, + "County": { + "name": "County", + "primary_key": "geoid", + "table": { + "name": "counties_shapes2017", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "State": { + "name": "State", + "depth": 1, + "key_column": "state_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "state_name" + }, + "key_type": "i64", + "property_map": {} + }, + "County": { + "name": "County", + "depth": 2, + "key_column": "geoid", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "name" + }, + "key_type": "i64", + "property_map": {} + } + } + } + } + }, + "Action Date": { + "name": "Action Date", + 
"default_hierarchy": "Action Date", + "annotations": { + "dim_type": "TIME" + }, + "captions": {}, + "dim_type": "time", + "foreign_key": "action_date", + "hierarchy_map": { + "Action Date": { + "name": "Action Date", + "primary_key": "date_id", + "table": { + "name": "dim_date", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Year": { + "name": "Year", + "depth": 1, + "key_column": "year", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "i64", + "property_map": {} + }, + "Quarter": { + "name": "Quarter", + "depth": 2, + "key_column": "quarter", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "i64", + "property_map": {} + }, + "Month": { + "name": "Month", + "depth": 3, + "key_column": "month", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "i64", + "property_map": {} + }, + "Day": { + "name": "Day", + "depth": 4, + "key_column": "date_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "day" + }, + "key_type": "i64", + "property_map": {} + } + } + } + } + }, + "Fiscal Year": { + "name": "Fiscal Year", + "default_hierarchy": "Fiscal Year", + "annotations": {}, + "captions": {}, + "dim_type": "time", + "foreign_key": "fiscal_year", + "hierarchy_map": { + "Fiscal Year": { + "name": "Fiscal Year", + "primary_key": "fiscal_year", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Fiscal Year": { + "name": "Fiscal Year", + "depth": 1, + "key_column": "fiscal_year", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u16", + "property_map": {} + } + } + } + } + }, + "Transaction Type": { + "name": "Transaction Type", + "default_hierarchy": "Transaction Type", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": "transaction_type_id", + "hierarchy_map": { + "Transaction 
Type": { + "name": "Transaction Type", + "primary_key": "transaction_type_id", + "table": { + "name": "dim_type", + "headers": [ + "transaction_type_id", + "transaction_type", + "transaction_type_parent" + ], + "types": [ + "u8", + "str", + "str" + ], + "rows": [ + [ + 0, + "Contract", + "Contract" + ], + [ + 2, + "Block grant", + "Grant" + ], + [ + 3, + "Formula grant", + "Grant" + ], + [ + 4, + "Project grant", + "Grant" + ], + [ + 5, + "Cooperative agreement", + "Grant" + ], + [ + 6, + "Direct payment for specified use, as a subsidy or other non-reimbursable direct financial aid", + "Direct payments" + ], + [ + 7, + "Direct loan", + "Loans" + ], + [ + 8, + "Guaranteed/insured loan", + "Loans" + ], + [ + 9, + "Insurance", + "Other" + ], + [ + 10, + "Direct payment with unrestricted use (retirement, pension, veterans benefits, etc.)", + "Direct payments" + ], + [ + 11, + "Other reimbursable, contingent, intangible, or indirect financial assistance", + "Other" + ] + ] + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Transaction Type Parent": { + "name": "Transaction Type Parent", + "depth": 1, + "key_column": "transaction_type_parent", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "i64", + "property_map": {} + }, + "Transaction Type": { + "name": "Transaction Type", + "depth": 2, + "key_column": "transaction_type_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "transaction_type" + }, + "key_type": "i64", + "property_map": {} + } + } + } + } + }, + "Agency": { + "name": "Agency", + "default_hierarchy": "Agency", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": "award_subagency_id", + "hierarchy_map": { + "Agency": { + "name": "Agency", + "primary_key": "sub_agency_code", + "table": { + "name": "dim_agency", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + 
"Department": { + "name": "Department", + "depth": 1, + "key_column": "agency_code", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "agency" + }, + "key_type": "i64", + "property_map": {} + }, + "Agency": { + "name": "Agency", + "depth": 2, + "key_column": "sub_agency_code", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "sub_agency" + }, + "key_type": "i64", + "property_map": {} + } + } + } + } + }, + "Product Service Code": { + "name": "Product Service Code", + "default_hierarchy": "Product Service Code", + "annotations": { + "dim_type": "PSC" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "psc", + "hierarchy_map": { + "Product Service Code": { + "name": "Product Service Code", + "primary_key": "sub_psc_code", + "table": { + "name": "dim_psc", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "PSC Group": { + "name": "PSC Group", + "depth": 1, + "key_column": "psc_code", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "psc" + }, + "key_type": "i64", + "property_map": {} + }, + "PSC Sub Group": { + "name": "PSC Sub Group", + "depth": 2, + "key_column": "sub_psc_code", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "sub_psc" + }, + "key_type": "i64", + "property_map": {} + } + } + } + } + }, + "NAPCS": { + "name": "NAPCS", + "default_hierarchy": "NAPCS", + "annotations": { + "dim_type": "NAPCS" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "napcs_code", + "hierarchy_map": { + "NAPCS": { + "name": "NAPCS", + "primary_key": "napcs5", + "table": { + "name": "dim_napcs", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "NAPCS Section": { + "name": "NAPCS Section", + "depth": 1, + "key_column": "napcs2", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": 
"napcs2_title" + }, + "key_type": "i64", + "property_map": {} + }, + "NAPCS Group": { + "name": "NAPCS Group", + "depth": 2, + "key_column": "napcs3", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "napcs3_title" + }, + "key_type": "i64", + "property_map": {} + }, + "NAPCS Class": { + "name": "NAPCS Class", + "depth": 3, + "key_column": "napcs5", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "napcs5_title" + }, + "key_type": "i64", + "property_map": {} + } + } + } + } + } + }, + "measure_map": { + "Obligation Amount": { + "name": "Obligation Amount", + "key_column": "obligation_amt", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "units_of_measurement": "USD" + }, + "captions": {}, + "submeasures": {} + }, + "Total Loan Value": { + "name": "Total Loan Value", + "key_column": "total_loan_value", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "units_of_measurement": "USD" + }, + "captions": {}, + "submeasures": {} + } + }, + "subset_table": true, + "visible": true + }, + "health_opioid_overdose_deathrate": { + "name": "health_opioid_overdose_deathrate", + "table": { + "name": "health_opioid_overdose_deathrate", + "primary_key": "id", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "dataset_link": "https://www.kff.org/other/state-indicator/opioid-overdose-death-rates/?currentTimeframe=0&sortModel=%7B%22colId%22:%22Location%22,%22sort%22:%22asc%22%7D", + "source_name": "Kaiser Family Foundation", + "source_description": "State Health Facts provides free, up-to-date, health data for all 50 states, the District of Columbia, the United States, counties, territories, and other geographies.", + "source_link": "https://www.kff.org/", + "dataset_name": "State Health Facts", + "topic": "Health", + "subtopic": "Behavioral Health Conditions" + }, + "captions": {}, + "dimension_map": { + "Year": { + "name": "Year", + "default_hierarchy": "Year", + "annotations": {}, 
+ "captions": {}, + "dim_type": "time", + "foreign_key": "year", + "hierarchy_map": { + "Year": { + "name": "Year", + "primary_key": "year", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Year": { + "name": "Year", + "depth": 1, + "key_column": "year", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u16", + "property_map": {} + } + } + } + } + }, + "Geography": { + "name": "Geography", + "default_hierarchy": "Nation", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": "geo", + "hierarchy_map": { + "Nation": { + "name": "Nation", + "primary_key": "geoid", + "table": { + "name": "us_nation", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Nation": { + "name": "Nation", + "depth": 1, + "key_column": "geoid", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "name" + }, + "key_type": "i64", + "property_map": {} + } + } + }, + "State": { + "name": "State", + "primary_key": "geoid", + "table": { + "name": "states_shapes2017", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "State": { + "name": "State", + "depth": 1, + "key_column": "geoid", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "name" + }, + "key_type": "i64", + "property_map": {} + } + } + }, + "County": { + "name": "County", + "primary_key": "geoid", + "table": { + "name": "counties_shapes2017", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "State County": { + "name": "State County", + "depth": 1, + "key_column": "state_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "state_name" + }, + "key_type": "i64", + "property_map": {} + }, + "County": { + "name": "County", + "depth": 2, + 
"key_column": "geoid", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "name" + }, + "key_type": "i64", + "property_map": {} + } + } + } + } + } + }, + "measure_map": { + "Opioid overdose death rate": { + "name": "Opioid overdose death rate", + "key_column": "opioid_overdose_deathrate", + "aggregator": { + "type": "Median" + }, + "annotations": { + "units_of_measurement": "People", + "details": "Among the deaths with drug overdose as the underlying cause, the type of opioid involved is indicated by ICD-10 multiple cause-of-death codes. Age-adjusted death rates were calculated by applying age-specific death rates to the 2000 U.S. standard population age distribution." + }, + "captions": {}, + "submeasures": {} + } + }, + "subset_table": true, + "visible": true + }, + "BLS Employment - Industry Only": { + "name": "BLS Employment - Industry Only", + "table": { + "name": "bls_industry_fact", + "primary_key": "industry_id", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "dataset_link": "https://www.bls.gov/data/#employment", + "source_name": "Bureau of Labor Statistics", + "source_description": "The Bureau of Labor Statistics (BLS) of the U.S. 
Department of Labor is the principal federal agency responsible for measuring labor market activity, working conditions, and price changes in the economy.", + "dataset_name": "Current Employment Statistics (CES)", + "topic": "Economy", + "subtopic": "Employment" + }, + "captions": {}, + "dimension_map": { + "Time": { + "name": "Time", + "default_hierarchy": "Time", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": "time_id", + "hierarchy_map": { + "Time": { + "name": "Time", + "primary_key": "time_id", + "table": { + "name": "dim_time", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Month of Year": { + "name": "Month of Year", + "depth": 1, + "key_column": "time_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "month_name" + }, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "Employment State": { + "name": "Employment State", + "default_hierarchy": "Employment State", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": "state_id", + "hierarchy_map": { + "Employment State": { + "name": "Employment State", + "primary_key": "state_id", + "table": { + "name": "dim_state", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Employment State": { + "name": "Employment State", + "depth": 1, + "key_column": "state_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "state_name" + }, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "Industry": { + "name": "Industry", + "default_hierarchy": "Industry", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": "industry_id", + "hierarchy_map": { + "Industry": { + "name": "Industry", + "primary_key": "industry_id", + "table": { + "name": "dim_industry", + "primary_key": "id", + "schema": null + }, + 
"annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Industry": { + "name": "Industry", + "depth": 1, + "key_column": "industry_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "industry_name" + }, + "key_type": "str", + "property_map": {} + } + } + } + } + } + }, + "measure_map": { + "NSA Employees": { + "name": "NSA Employees", + "key_column": "NSA_employees", + "aggregator": { + "type": "Sum" + }, + "annotations": {}, + "captions": {}, + "submeasures": {} + }, + "NSA Average Employees": { + "name": "NSA Average Employees", + "key_column": "NSA_employees", + "aggregator": { + "type": "Average" + }, + "annotations": {}, + "captions": {}, + "submeasures": {} + }, + "SA Employees": { + "name": "SA Employees", + "key_column": "SA_employees", + "aggregator": { + "type": "Sum" + }, + "annotations": {}, + "captions": {}, + "submeasures": {} + }, + "SA Average Employees": { + "name": "SA Average Employees", + "key_column": "SA_employees", + "aggregator": { + "type": "Average" + }, + "annotations": {}, + "captions": {}, + "submeasures": {} + } + }, + "subset_table": false, + "visible": true + }, + "Data_USA_Electoral_College_president": { + "name": "Data_USA_Electoral_College_president", + "table": { + "name": "election_electoralcollege", + "primary_key": "geoid", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "dataset_link": "https://transition.fec.gov/pubrec/fe2004/federalelections2004.xls", + "source_name": "Federal Election Commision", + "source_link": "https://www.fec.gov/", + "dataset_name": "Federal Elections 2004: Election Results for the U.S. President, the U.S. Senate, and the U.S. 
House of Representatives", + "available_dimensions": "Geography, Party, Year", + "topic": "Election", + "subtopic": "Electoral College" + }, + "captions": {}, + "dimension_map": { + "State": { + "name": "State", + "source": "State Election", + "foreign_key": "geoid", + "annotations": {}, + "captions": {}, + "hierarchy_map": {} + }, + "Year": { + "name": "Year", + "default_hierarchy": "Year", + "annotations": {}, + "captions": {}, + "dim_type": "time", + "foreign_key": "year", + "hierarchy_map": { + "Year": { + "name": "Year", + "primary_key": "year", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Year": { + "name": "Year", + "depth": 1, + "key_column": "year", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u16", + "property_map": {} + } + } + } + } + }, + "Party": { + "name": "Party", + "default_hierarchy": "Party", + "annotations": { + "dim_type": "PARTY" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "party", + "hierarchy_map": { + "Party": { + "name": "Party", + "primary_key": "party", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Party": { + "name": "Party", + "depth": 1, + "key_column": "party", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "str", + "property_map": {} + } + } + } + } + } + }, + "measure_map": { + "Electoral College Votes": { + "name": "Electoral College Votes", + "key_column": "electoralvote", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "units_ofmeasurement": "VOTES" + }, + "captions": {}, + "submeasures": {} + } + }, + "subset_table": false, + "visible": true + }, + "bea_use": { + "name": "bea_use", + "table": { + "name": "bea_use", + "primary_key": "commodity_iocode", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "dataset_link": "https://www.bea.gov/industry/input-output-accounts-data", + 
"source_name": "Bureau of Economic Analysis", + "source_description": "The Bureau of Economic Analysis (BEA) publishes data on Input-Output, also called Make-Use, for industries in the United States. This Dataset is provided by the US Department of Commerce. Use of commodities by industry are valued at producers prices.", + "source_link": "https://bea.gov/", + "dataset_name": "Use Tables", + "topic": "Economy", + "subtopic": "Industry Flows" + }, + "captions": {}, + "dimension_map": { + "Year": { + "name": "Year", + "default_hierarchy": "Year", + "annotations": { + "dim_type": "TIME" + }, + "captions": {}, + "dim_type": "time", + "foreign_key": "year", + "hierarchy_map": { + "Year": { + "name": "Year", + "primary_key": "Year", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Year": { + "name": "Year", + "depth": 1, + "key_column": "year", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u16", + "property_map": {} + } + } + } + } + }, + "Industry IO Code": { + "name": "Industry IO Code", + "default_hierarchy": "Industry IO Code", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": "industry_iocode", + "hierarchy_map": { + "Industry IO Code": { + "name": "Industry IO Code", + "primary_key": "industry_iocode", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Industry L0": { + "name": "Industry L0", + "depth": 1, + "key_column": "industry_iocode_parent", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "industry_iocode_parent_name" + }, + "key_type": "str", + "property_map": {} + }, + "Industry L1": { + "name": "Industry L1", + "depth": 2, + "key_column": "industry_iocode", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "industry_iocode_description" + }, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "Commodity IO Code": { + "name": "Commodity 
IO Code", + "default_hierarchy": "Commodity IO Code", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": "commodity_iocode", + "hierarchy_map": { + "Commodity IO Code": { + "name": "Commodity IO Code", + "primary_key": "commodity_iocode", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Commodity L0": { + "name": "Commodity L0", + "depth": 1, + "key_column": "commodity_iocode_parent", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "commodity_iocode_parent_name" + }, + "key_type": "str", + "property_map": {} + }, + "Commodity L1": { + "name": "Commodity L1", + "depth": 2, + "key_column": "commodity_iocode", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "commodity_iocode_description" + }, + "key_type": "str", + "property_map": {} + } + } + } + } + } + }, + "measure_map": { + "Value Millions": { + "name": "Value Millions", + "key_column": "value_millions", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "units_of_measurement": "USD" + }, + "captions": {}, + "submeasures": {} + } + }, + "subset_table": false, + "visible": true + }, + "Data_USA_President_election": { + "name": "Data_USA_President_election", + "table": { + "name": "election_president", + "primary_key": "candidate_id", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "hidden_dimensions": "Version", + "dataset_link": "https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/VOQCHQ/FQ9NBF", + "source_name": "Federal Election Commision", + "source_link": "https://www.fec.gov/", + "dataset_name": "County Presidential Election Returns 2000-2020", + "available_dimensions": "Candidate, Geography, Party, Year", + "topic": "Election", + "subtopic": "President" + }, + "captions": {}, + "dimension_map": { + "Geography": { + "name": "Geography", + "default_hierarchy": "Nation", + "annotations": { + 
"dim_type": "GEOGRAPHY" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "geo_id", + "hierarchy_map": { + "Nation": { + "name": "Nation", + "primary_key": "geoid", + "table": { + "name": "us_nation", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": [ + "Nation", + "01000US" + ], + "level_map": { + "Nation": { + "name": "Nation", + "depth": 1, + "key_column": "geoid", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "name" + }, + "key_type": "str", + "property_map": {} + } + } + }, + "State": { + "name": "State", + "primary_key": "geoid", + "table": { + "name": "states_shapes2017", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "State": { + "name": "State", + "depth": 1, + "key_column": "geoid", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "name" + }, + "key_type": "str", + "property_map": {} + } + } + }, + "County": { + "name": "County", + "primary_key": "geoid", + "table": { + "name": "counties_shapes2017", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "State County": { + "name": "State County", + "depth": 1, + "key_column": "state_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "state_name" + }, + "key_type": "str", + "property_map": {} + }, + "County": { + "name": "County", + "depth": 2, + "key_column": "geoid", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "name" + }, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "Candidate": { + "name": "Candidate", + "default_hierarchy": "Candidate", + "annotations": { + "dim_type": "CANDIDATE" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "candidate_id", + "hierarchy_map": { + "Candidate": { + "name": "Candidate", + "primary_key": "candidate_id", + "table": 
null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Candidate": { + "name": "Candidate", + "depth": 1, + "key_column": "candidate_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "candidate" + }, + "key_type": "str", + "property_map": { + "Office": { + "name": "Office", + "annotations": {}, + "captions": {}, + "key_column_map": { + "xx": "office" + }, + "key_type": "i64" + } + } + } + } + } + } + }, + "Party": { + "name": "Party", + "default_hierarchy": "Party", + "annotations": { + "dim_type": "PARTY" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "party", + "hierarchy_map": { + "Party": { + "name": "Party", + "primary_key": "party", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Party": { + "name": "Party", + "depth": 1, + "key_column": "party", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "Year": { + "name": "Year", + "default_hierarchy": "Year", + "annotations": { + "dim_type": "YEAR" + }, + "captions": {}, + "dim_type": "time", + "foreign_key": "year", + "hierarchy_map": { + "Year": { + "name": "Year", + "primary_key": "year", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Year": { + "name": "Year", + "depth": 1, + "key_column": "year", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u16", + "property_map": {} + } + } + } + } + } + }, + "measure_map": { + "Candidate Votes": { + "name": "Candidate Votes", + "key_column": "candidatevotes", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "units_of_measurement": "VOTES" + }, + "captions": {}, + "submeasures": {} + }, + "Total Votes": { + "name": "Total Votes", + "key_column": "totalvotes", + "aggregator": { + "type": "Max" + }, + "annotations": { + "units_of_measurement": "VOTES" + }, + "captions": 
{}, + "submeasures": {} + } + }, + "subset_table": true, + "visible": true + }, + "dot_faf": { + "name": "dot_faf", + "table": { + "name": "dot_faf", + "primary_key": "region_id", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "dataset_link": "https://ops.fhwa.dot.gov/freight/freight_analysis/faf/", + "source_name": "Department of Transportation Federal Highway Administration", + "source_description": "The Freight Analysis Framework (FAF), produced through a partnership between Bureau of Transportation Statistics (BTS) and Federal Highway Administration (FHWA), integrates data from a variety of sources to create a comprehensive picture of freight movement among states and major metropolitan areas by all modes of transportation. Starting with data from the 2012 Commodity Flow Survey (CFS) and international trade data from the Census Bureau, FAF incorporates data from agriculture, extraction, utility, construction, service, and other sectors. The FAF data give a picture of which goods are shipped from one region of the US to another region, according to type of commodity, mode of shipment, value, and weight.", + "source_link": "https://www.fhwa.dot.gov/", + "dataset_name": "Freight Analysis Framework Domestic Flows", + "topic": "Economy", + "subtopic": "Freight" + }, + "captions": {}, + "dimension_map": { + "Year": { + "name": "Year", + "default_hierarchy": "Year", + "annotations": { + "dim_type": "TIME" + }, + "captions": {}, + "dim_type": "time", + "foreign_key": "year", + "hierarchy_map": { + "Year": { + "name": "Year", + "primary_key": "Year", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Year": { + "name": "Year", + "depth": 1, + "key_column": "year", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u16", + "property_map": { + "Year Base": { + "name": "Year Base", + "annotations": {}, + "captions": {}, + "key_column_map": { + "xx": 
"year_base" + }, + "key_type": "i64" + } + } + } + } + } + } + }, + "Origin": { + "name": "Origin", + "default_hierarchy": "Origin", + "annotations": { + "dim_type": "GEOGRAPHY" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "origin_region", + "hierarchy_map": { + "Origin": { + "name": "Origin", + "primary_key": "region_id", + "table": { + "name": "faf_regions", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Origin State": { + "name": "Origin State", + "depth": 1, + "key_column": "state_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "state_name" + }, + "key_type": "str", + "property_map": {} + }, + "Origin Region": { + "name": "Origin Region", + "depth": 2, + "key_column": "region_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "region_name" + }, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "Destination": { + "name": "Destination", + "default_hierarchy": "Destination", + "annotations": { + "dim_type": "GEOGRAPHY" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "destination_region", + "hierarchy_map": { + "Destination": { + "name": "Destination", + "primary_key": "region_id", + "table": { + "name": "faf_regions", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Destination State": { + "name": "Destination State", + "depth": 1, + "key_column": "state_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "state_name" + }, + "key_type": "str", + "property_map": {} + }, + "Destination Region": { + "name": "Destination Region", + "depth": 2, + "key_column": "region_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "region_name" + }, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "SCTG": { + "name": "SCTG", + "default_hierarchy": "SCTG", 
+ "annotations": { + "dim_type": "PRODUCT" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "sctg", + "hierarchy_map": { + "SCTG": { + "name": "SCTG", + "primary_key": "sctg_code", + "table": { + "name": "sctg", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "SCTG2": { + "name": "SCTG2", + "depth": 1, + "key_column": "sctg_code", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "sctg_name" + }, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "Transportation Mode": { + "name": "Transportation Mode", + "default_hierarchy": "Transportation Mode", + "annotations": { + "dim_type": "GENERIC" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "mode", + "hierarchy_map": { + "Transportation Mode": { + "name": "Transportation Mode", + "primary_key": "id", + "table": { + "name": "transportation_mode", + "headers": [ + "id", + "name", + "description" + ], + "types": [ + "u8", + "str", + "str" + ], + "rows": [ + [ + 1, + "Truck", + "Includes private and for-hire truck. Does not include truck that is part of Multiple Modes and Mail or truck moves in conjunction with domestic air cargo." + ], + [ + 2, + "Rail", + "Includes any common carrier or private railroad. Does not include rail that is part of Multiple Modes and Mail." + ], + [ + 3, + "Water", + "Includes shallow draft, deep draft, Great Lakes and intra-port shipments. Does not include water that is part of Multiple Modes and Mail." + ], + [ + 4, + "Air (includes truck-air)", + "Includes shipments move by air or a combination of truck and air in commercial or private aircraft. Includes air freight and air express. In the case of imports and exports by air, domestic moves by ground to and from the port of entry or exit are categorized with Truck." + ], + [ + 5, + "Multiple Modes and Mail", + "Includes shipments by multiple modes and by parcel delivery services, U.S. 
Postal Service, or couriers (capped at 150 pounds). This category is not limited to containerized or trailer-on-flatcar shipments." + ], + [ + 6, + "Pipeline", + "Includes crude petroleum, natural gas, and product pipelines. Note: Does include flows from offshore wells to land which are counted as Water moves by the U.S. Army Corps of Engineers. Does not include pipeline that is part of Multiple Modes and Mail." + ], + [ + 7, + "Other and Unknown", + "Includes movements not elsewhere classified such as flyaway aircraft, and shipments for which the mode cannot be determined." + ], + [ + 8, + "No domestic mode", + "Includes shipments that have an international mode, but no domestic mode and is limited to import shipments of crude petroleum transferred directly from inbound ships to a U.S. refinery at the zone of entry. This classification enables a proper accounting of flows that do not utilize any domestic transportation network.." + ] + ] + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Transportation Mode": { + "name": "Transportation Mode", + "depth": 1, + "key_column": "id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "name" + }, + "key_type": "u8", + "property_map": { + "Description": { + "name": "Description", + "annotations": {}, + "captions": {}, + "key_column_map": { + "xx": "description" + }, + "key_type": "i64" + } + } + } + } + } + } + } + }, + "measure_map": { + "Millions Of Dollars": { + "name": "Millions Of Dollars", + "key_column": "value_millions", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "units_of_measurement": "USD", + "details": "Value in millions of 2012 US Dollars." + }, + "captions": {}, + "submeasures": {} + }, + "Thousands Of Tons": { + "name": "Thousands Of Tons", + "key_column": "thousand_tons", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "units_of_measurement": "Thousands of Tons", + "details": "Weight in thousands of tons." 
+ }, + "captions": {}, + "submeasures": {} + } + }, + "subset_table": false, + "visible": true + }, + "BLS Unemployment Insurance Claims - Most Recent": { + "name": "BLS Unemployment Insurance Claims - Most Recent", + "table": { + "name": "bls_insurance_most_recent", + "primary_key": "week_ended", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "source_name": "U.S Department of Labor", + "source_link": "https://www.dol.gov/", + "topic": "Economy", + "subtopic": "Employment" + }, + "captions": {}, + "dimension_map": { + "State": { + "name": "State", + "source": "State", + "foreign_key": "fips_code", + "annotations": {}, + "captions": {}, + "hierarchy_map": {} + }, + "Week Ended": { + "name": "Week Ended", + "default_hierarchy": "Week Ended", + "annotations": {}, + "captions": {}, + "dim_type": "time", + "foreign_key": "week_ended", + "hierarchy_map": { + "Week Ended": { + "name": "Week Ended", + "primary_key": "date", + "table": { + "name": "dim_shared_date", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Week Ended": { + "name": "Week Ended", + "depth": 1, + "key_column": "date", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "Week Previous": { + "name": "Week Previous", + "default_hierarchy": "Week Previous", + "annotations": {}, + "captions": {}, + "dim_type": "time", + "foreign_key": "reflecting_week_end", + "hierarchy_map": { + "Week Previous": { + "name": "Week Previous", + "primary_key": "date", + "table": { + "name": "dim_shared_date", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Week Previous": { + "name": "Week Previous", + "depth": 1, + "key_column": "date", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "str", + "property_map": {} + } + 
} + } + } + } + }, + "measure_map": { + "Initial Claims": { + "name": "Initial Claims", + "key_column": "initial_claims", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "details": "Value" + }, + "captions": {}, + "submeasures": {} + }, + "Continued Claims": { + "name": "Continued Claims", + "key_column": "continued_claims", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "details": "Value" + }, + "captions": {}, + "submeasures": {} + }, + "Covered Employment": { + "name": "Covered Employment", + "key_column": "covered_employment", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "details": "Value" + }, + "captions": {}, + "submeasures": {} + }, + "Insured Unemployment Rate": { + "name": "Insured Unemployment Rate", + "key_column": "insured_unemployment_rate", + "aggregator": { + "type": "Average" + }, + "annotations": { + "details": "Rate" + }, + "captions": {}, + "submeasures": {} + } + }, + "subset_table": false, + "visible": true + }, + "ed_defaults": { + "name": "ed_defaults", + "table": { + "name": "ed_defaults", + "primary_key": "opeid", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "dataset_description": "According to the Department of Education: 'A cohort default rate is the percentage of a school's borrowers who enter repayment on certain Federal Family Education Loan (FFEL) Program or William D. 
Ford Federal Direct Loan (Direct Loan) Program loans during a particular federal fiscal year (FY), October 1 to September 30, and default or meet other specified conditions prior to the end of the second following fiscal year.'", + "dataset_link": "https://www2.ed.gov/offices/OSFAP/defaultmanagement/cdr.html", + "source_name": "Department of Education", + "source_description": "The Department of Education's 'mission is to promote student achievement and preparation for global competitiveness by fostering educational excellence and ensuring equal access.'", + "source_link": "https://www.ed.gov/", + "dataset_name": "Cohort Default Rate Database", + "topic": "Education", + "subtopic": "Default Rate" + }, + "captions": {}, + "dimension_map": { + "Year": { + "name": "Year", + "default_hierarchy": "Year", + "annotations": { + "dim_type": "TIME" + }, + "captions": {}, + "dim_type": "time", + "foreign_key": "year", + "hierarchy_map": { + "Year": { + "name": "Year", + "primary_key": "Year", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Year": { + "name": "Year", + "depth": 1, + "key_column": "year", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u16", + "property_map": {} + } + } + } + } + }, + "Geography": { + "name": "Geography", + "default_hierarchy": "Geography", + "annotations": { + "dim_type": "GEOGRAPHY" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "county", + "hierarchy_map": { + "Geography": { + "name": "Geography", + "primary_key": "geoid", + "table": { + "name": "counties_shapes2017", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "State": { + "name": "State", + "depth": 1, + "key_column": "state_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "state_name" + }, + "key_type": "str", + "property_map": {} + }, + "County": { + "name": "County", + 
"depth": 2, + "key_column": "geoid", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "name" + }, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "OPEID": { + "name": "OPEID", + "default_hierarchy": "OPEID", + "annotations": { + "dim_type": "OPEID" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "opeid", + "hierarchy_map": { + "OPEID": { + "name": "OPEID", + "primary_key": "opeid", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "OPEID": { + "name": "OPEID", + "depth": 1, + "key_column": "opeid", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "opeid_name" + }, + "key_type": "str", + "property_map": {} + } + } + } + } + } + }, + "measure_map": { + "Borrowers In Default": { + "name": "Borrowers In Default", + "key_column": "num", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "units_of_measurement": "People" + }, + "captions": {}, + "submeasures": {} + }, + "Borrowers Entered Repayment": { + "name": "Borrowers Entered Repayment", + "key_column": "denom", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "units_of_measurement": "People" + }, + "captions": {}, + "submeasures": {} + }, + "Default Rate": { + "name": "Default Rate", + "key_column": "default_rate", + "aggregator": { + "type": "Average" + }, + "annotations": { + "units_of_measurement": "Rate" + }, + "captions": {}, + "submeasures": {} + } + }, + "subset_table": false, + "visible": true + }, + "bls_ces": { + "name": "bls_ces", + "table": { + "name": "bls_ces", + "primary_key": "id", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "dataset_link": "https://www.bls.gov/ces/", + "source_name": "Bureau of Labor Statistics", + "source_description": "The Bureau of Labor Statistics (BLS) of the U.S. 
Department of Labor is the principal federal agency responsible for measuring labor market activity, working conditions, and price changes in the economy.", + "dataset_name": "Current Employment Statistics", + "topic": "Economy", + "subtopic": "Industry" + }, + "captions": {}, + "dimension_map": { + "Year": { + "name": "Year", + "default_hierarchy": "Year", + "annotations": { + "dim_type": "TIME" + }, + "captions": {}, + "dim_type": "time", + "foreign_key": "year", + "hierarchy_map": { + "Year": { + "name": "Year", + "primary_key": "year", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Year": { + "name": "Year", + "depth": 1, + "key_column": "year", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u16", + "property_map": {} + } + } + } + } + }, + "BLS Industry Flat": { + "name": "BLS Industry Flat", + "source": "BLS Industry Flat", + "foreign_key": "naics_code", + "annotations": {}, + "captions": {}, + "hierarchy_map": {} + } + }, + "measure_map": { + "Industry Average Hourly Earnings": { + "name": "Industry Average Hourly Earnings", + "key_column": "avg_hrly_earnings", + "aggregator": { + "type": "Average" + }, + "annotations": { + "aggregation_method": "NONE", + "units_of_measurement": "USD", + "pre_aggregation_method": "AVG" + }, + "captions": {}, + "submeasures": {} + }, + "Industry Average Weekly Hours": { + "name": "Industry Average Weekly Hours", + "key_column": "avg_weekly_hours", + "aggregator": { + "type": "Average" + }, + "annotations": { + "aggregation_method": "NONE", + "units_of_measurement": "Hours", + "pre_aggregation_method": "AVG" + }, + "captions": {}, + "submeasures": {} + }, + "Industry Employees Thousands": { + "name": "Industry Employees Thousands", + "key_column": "employees_thousands", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "aggregation_method": "NONE", + "units_of_measurement": "Employees", + "pre_aggregation_method": "SUM" + }, + 
"captions": {}, + "submeasures": {} + } + }, + "subset_table": false, + "visible": true + }, + "health_estimates_of_chronically_homeless_individuals": { + "name": "health_estimates_of_chronically_homeless_individuals", + "table": { + "name": "health_estimates_of_chronically_homeless_individuals", + "primary_key": "id", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "dataset_link": "https://www.hudexchange.info/resources/documents/2017-AHAR-Part-1.pdf", + "source_name": "Department of Housing and Urban Development (HUD)", + "source_description": "Part 1 of the Annual Homeless Assessment Report to Congress (AHAR) provides Point-inTime (PIT) estimates, offering a snapshot of homelessness—both sheltered and unsheltered— on a single night. The PIT counts also provide an estimate of the number of people experiencing homelessness within particular homeless populations, such as people with chronic patterns of homelessness and veterans experiencing homelessness.", + "source_link": "http://hud.gov/", + "dataset_name": "The 2017 Annual Homeless Assessment Report (AHAR) to Congress, Part 1", + "topic": "Health", + "subtopic": "Drivers of Health" + }, + "captions": {}, + "dimension_map": { + "Year": { + "name": "Year", + "default_hierarchy": "Year", + "annotations": {}, + "captions": {}, + "dim_type": "time", + "foreign_key": "year", + "hierarchy_map": { + "Year": { + "name": "Year", + "primary_key": "year", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Year": { + "name": "Year", + "depth": 1, + "key_column": "year", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u16", + "property_map": {} + } + } + } + } + }, + "Geography": { + "name": "Geography", + "default_hierarchy": "State", + "annotations": { + "dim_type": "GEOGRAPHY" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "geo", + "hierarchy_map": { + "State": { + "name": "State", 
+ "primary_key": "geoid", + "table": { + "name": "states_shapes2017", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "State": { + "name": "State", + "depth": 1, + "key_column": "geoid", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "name" + }, + "key_type": "i64", + "property_map": {} + } + } + } + } + } + }, + "measure_map": { + "Estimates of Chronically Homeless Individuals": { + "name": "Estimates of Chronically Homeless Individuals", + "key_column": "state_chronically_homeless_individuals", + "aggregator": { + "type": "Median" + }, + "annotations": { + "units_of_measurement": "Number", + "details": "A chronically homeless individual refers to an individual with a disability who has been continuously homeless for one year or more or has experienced at least four episodes of homelessness in the last three years where the combined length of time homeless in those occasions is at least 12 months." 
+ }, + "captions": {}, + "submeasures": {} + } + }, + "subset_table": false, + "visible": true + }, + "Data_USA_Senate_election": { + "name": "Data_USA_Senate_election", + "table": { + "name": "election_senate", + "primary_key": "candidate_id", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "dataset_link": "https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/PEJ5QU/XXQCIK", + "source_name": "Federal Election Commision", + "source_link": "https://www.fec.gov/", + "dataset_name": "U.S Senate 1976-2020", + "available_dimensions": "Candidate, Geography, Party, Year, Candidate Other, Special", + "topic": "Election", + "subtopic": "Senate" + }, + "captions": {}, + "dimension_map": { + "State": { + "name": "State", + "source": "State Election", + "foreign_key": "geo_id", + "annotations": {}, + "captions": {}, + "hierarchy_map": {} + }, + "Candidate": { + "name": "Candidate", + "default_hierarchy": "Candidate", + "annotations": { + "dim_type": "CANDIDATE" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "candidate_id", + "hierarchy_map": { + "Candidate": { + "name": "Candidate", + "primary_key": "candidate_id", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Candidate": { + "name": "Candidate", + "depth": 1, + "key_column": "candidate_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "candidate" + }, + "key_type": "str", + "property_map": { + "Office": { + "name": "Office", + "annotations": {}, + "captions": {}, + "key_column_map": { + "xx": "office" + }, + "key_type": "i64" + } + } + } + } + } + } + }, + "Candidate Other": { + "name": "Candidate Other", + "default_hierarchy": "Candidate Other", + "annotations": { + "dim_type": "CANDIDATE OTHER" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "candidate_other", + "hierarchy_map": { + "Candidate Other": { + "name": "Candidate 
Other", + "primary_key": "candidate_other", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Candidate Other": { + "name": "Candidate Other", + "depth": 1, + "key_column": "candidate_other", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "Party": { + "name": "Party", + "default_hierarchy": "Party", + "annotations": { + "dim_type": "PARTY" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "party", + "hierarchy_map": { + "Party": { + "name": "Party", + "primary_key": "party", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Party": { + "name": "Party", + "depth": 1, + "key_column": "party", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "Special": { + "name": "Special", + "default_hierarchy": "Special", + "annotations": { + "dim_type": "SPECIAL" + }, + "captions": {}, + "dim_type": "standard", + "foreign_key": "special", + "hierarchy_map": { + "Special": { + "name": "Special", + "primary_key": "special", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Special": { + "name": "Special", + "depth": 1, + "key_column": "special", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u8", + "property_map": {} + } + } + } + } + }, + "Unofficial": { + "name": "Unofficial", + "default_hierarchy": "Unofficial", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": "unofficial", + "hierarchy_map": { + "Unofficial": { + "name": "Unofficial", + "primary_key": "unofficial", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Unofficial": { + "name": "Unofficial", + "depth": 1, + "key_column": "unofficial", + "annotations": {}, + "captions": {}, + 
"name_column_map": {}, + "key_type": "u8", + "property_map": {} + } + } + } + } + }, + "Year": { + "name": "Year", + "default_hierarchy": "Year", + "annotations": { + "dim_type": "YEAR" + }, + "captions": {}, + "dim_type": "time", + "foreign_key": "year", + "hierarchy_map": { + "Year": { + "name": "Year", + "primary_key": "year", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Year": { + "name": "Year", + "depth": 1, + "key_column": "year", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u16", + "property_map": {} + } + } + } + } + } + }, + "measure_map": { + "Candidate Votes": { + "name": "Candidate Votes", + "key_column": "candidatevotes", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "units_ofmeasurement": "VOTES" + }, + "captions": {}, + "submeasures": {} + }, + "Total Votes": { + "name": "Total Votes", + "key_column": "totalvotes", + "aggregator": { + "type": "Max" + }, + "annotations": { + "units_ofmeasurement": "VOTES" + }, + "captions": {}, + "submeasures": {} + } + }, + "subset_table": false, + "visible": true + }, + "BLS Employment - Supersector Only": { + "name": "BLS Employment - Supersector Only", + "table": { + "name": "bls_supersector_fact", + "primary_key": "supersector_id", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "dataset_link": "https://www.bls.gov/data/#employment", + "source_name": "Bureau of Labor Statistics", + "source_description": "The Bureau of Labor Statistics (BLS) of the U.S. 
Department of Labor is the principal federal agency responsible for measuring labor market activity, working conditions, and price changes in the economy.", + "dataset_name": "Current Employment Statistics (CES)", + "topic": "Economy", + "subtopic": "Employment" + }, + "captions": {}, + "dimension_map": { + "Time": { + "name": "Time", + "default_hierarchy": "Time", + "annotations": {}, + "captions": {}, + "dim_type": "time", + "foreign_key": "time_id", + "hierarchy_map": { + "Time": { + "name": "Time", + "primary_key": "time_id", + "table": { + "name": "dim_time", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Month of Year": { + "name": "Month of Year", + "depth": 1, + "key_column": "time_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "month_name" + }, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "Employment State": { + "name": "Employment State", + "default_hierarchy": "Employment State", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": "state_id", + "hierarchy_map": { + "Employment State": { + "name": "Employment State", + "primary_key": "state_id", + "table": { + "name": "dim_state", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Employment State": { + "name": "Employment State", + "depth": 1, + "key_column": "state_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "state_name" + }, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "Supersector": { + "name": "Supersector", + "default_hierarchy": "Supersector", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": "supersector_id", + "hierarchy_map": { + "Supersector": { + "name": "Supersector", + "primary_key": "supersector_id", + "table": { + "name": "dim_supersector", + "primary_key": "id", + 
"schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Supersector": { + "name": "Supersector", + "depth": 1, + "key_column": "supersector_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "supersector_name" + }, + "key_type": "str", + "property_map": {} + } + } + } + } + } + }, + "measure_map": { + "NSA Employees": { + "name": "NSA Employees", + "key_column": "NSA_employees", + "aggregator": { + "type": "Sum" + }, + "annotations": {}, + "captions": {}, + "submeasures": {} + }, + "NSA Average Employees": { + "name": "NSA Average Employees", + "key_column": "NSA_employees", + "aggregator": { + "type": "Average" + }, + "annotations": {}, + "captions": {}, + "submeasures": {} + }, + "SA Employees": { + "name": "SA Employees", + "key_column": "SA_employees", + "aggregator": { + "type": "Sum" + }, + "annotations": {}, + "captions": {}, + "submeasures": {} + }, + "SA Average Employees": { + "name": "SA Average Employees", + "key_column": "SA_employees", + "aggregator": { + "type": "Average" + }, + "annotations": {}, + "captions": {}, + "submeasures": {} + } + }, + "subset_table": false, + "visible": true + }, + "BLS Unemployment Insurance Claims": { + "name": "BLS Unemployment Insurance Claims", + "table": { + "name": "bls_insurance_claims", + "primary_key": "week_ended", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "source_name": "U.S Department of Labor", + "source_link": "https://www.dol.gov/", + "topic": "Economy", + "subtopic": "Employment" + }, + "captions": {}, + "dimension_map": { + "State": { + "name": "State", + "source": "State", + "foreign_key": "fips_code", + "annotations": {}, + "captions": {}, + "hierarchy_map": {} + }, + "Week Ended": { + "name": "Week Ended", + "default_hierarchy": "Week Ended", + "annotations": {}, + "captions": {}, + "dim_type": "time", + "foreign_key": "week_ended", + "hierarchy_map": { + "Week Ended": { + "name": 
"Week Ended", + "primary_key": "date", + "table": { + "name": "dim_shared_date", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Week Ended": { + "name": "Week Ended", + "depth": 1, + "key_column": "date", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "Week Previous": { + "name": "Week Previous", + "default_hierarchy": "Week Previous", + "annotations": {}, + "captions": {}, + "dim_type": "time", + "foreign_key": "reflecting_week_end", + "hierarchy_map": { + "Week Previous": { + "name": "Week Previous", + "primary_key": "date", + "table": { + "name": "dim_shared_date", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Week Previous": { + "name": "Week Previous", + "depth": 1, + "key_column": "date", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "str", + "property_map": {} + } + } + } + } + } + }, + "measure_map": { + "Initial Claims": { + "name": "Initial Claims", + "key_column": "initial_claims", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "details": "Value" + }, + "captions": {}, + "submeasures": {} + }, + "Continued Claims": { + "name": "Continued Claims", + "key_column": "continued_claims", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "details": "Value" + }, + "captions": {}, + "submeasures": {} + }, + "Covered Employment": { + "name": "Covered Employment", + "key_column": "covered_employment", + "aggregator": { + "type": "Sum" + }, + "annotations": { + "details": "Value" + }, + "captions": {}, + "submeasures": {} + }, + "Insured Unemployment Rate": { + "name": "Insured Unemployment Rate", + "key_column": "insured_unemployment_rate", + "aggregator": { + "type": "Average" + }, + "annotations": { + "details": "Rate" + }, + "captions": {}, + "submeasures": {} + } 
+ }, + "subset_table": false, + "visible": true + }, + "onet_by_pums": { + "name": "onet_by_pums", + "table": { + "name": "onet_by_pums", + "primary_key": "id", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "dataset_link": "http://www.onetonline.org/", + "source_name": "O*NET Online", + "source_description": "The O*Net Skills is a dataset containing detailed descriptions of the required and used skills for specific occupations. The O*Net dataset is sponsored by the United States Department of Labor.", + "dataset_name": "O*NET by PUMS Occupation", + "topic": "Economy", + "subtopic": "Skills" + }, + "captions": {}, + "dimension_map": { + "PUMS Occupation": { + "name": "PUMS Occupation", + "source": "PUMS Occupation", + "foreign_key": "pums_code", + "annotations": {}, + "captions": {}, + "hierarchy_map": {} + }, + "Skill Element": { + "name": "Skill Element", + "source": "Skill Element", + "foreign_key": "element_id", + "annotations": {}, + "captions": {}, + "hierarchy_map": {} + }, + "Year": { + "name": "Year", + "default_hierarchy": "Year", + "annotations": { + "dim_type": "TIME" + }, + "captions": {}, + "dim_type": "time", + "foreign_key": "year", + "hierarchy_map": { + "Year": { + "name": "Year", + "primary_key": "Year", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Year": { + "name": "Year", + "depth": 1, + "key_column": "year", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u16", + "property_map": {} + } + } + } + } + } + }, + "measure_map": { + "IM Value": { + "name": "IM Value", + "key_column": "im", + "aggregator": { + "type": "Average" + }, + "annotations": { + "aggregation_method": "NONE", + "units_of_measurement": "NONE", + "details": "Not used for presentation", + "hide_in_ui": "true" + }, + "captions": {}, + "submeasures": {} + }, + "LV Value": { + "name": "LV Value", + "key_column": "lv", + "aggregator": { + "type": 
"Average" + }, + "annotations": { + "aggregation_method": "NONE", + "units_of_measurement": "NONE", + "details": "Not used for presentation", + "hide_in_ui": "true" + }, + "captions": {}, + "submeasures": {} + }, + "Total Score": { + "name": "Total Score", + "key_column": "total_score", + "aggregator": { + "type": "Average" + }, + "annotations": { + "aggregation_method": "NONE", + "units_of_measurement": "NONE", + "details": "Calculated by IM Value * LV Value" + }, + "captions": {}, + "submeasures": {} + } + }, + "subset_table": false, + "visible": true + }, + "bls_growth_occupation": { + "name": "bls_growth_occupation", + "table": { + "name": "bls_growth_occupation", + "primary_key": "id", + "schema": null + }, + "acl": { + "public": true, + "rules": {} + }, + "annotations": { + "dataset_link": "https://www.bls.gov/bls/occupation.htm", + "source_name": "Bureau of Labor Statistics", + "source_description": "The Bureau of Labor Statistics (BLS) of the U.S. Department of Labor is the principal federal agency responsible for measuring labor market activity, working conditions, and price changes in the economy.", + "dataset_name": "BLS Statistics by Occupation, Growth", + "topic": "Economy", + "subtopic": "Occupation" + }, + "captions": {}, + "dimension_map": { + "Year": { + "name": "Year", + "default_hierarchy": "Year", + "annotations": { + "dim_type": "TIME" + }, + "captions": {}, + "dim_type": "time", + "foreign_key": "year", + "hierarchy_map": { + "Year": { + "name": "Year", + "primary_key": "year", + "table": null, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Year": { + "name": "Year", + "depth": 1, + "key_column": "year", + "annotations": {}, + "captions": {}, + "name_column_map": {}, + "key_type": "u16", + "property_map": {} + } + } + } + } + }, + "BLS Occupation Flat": { + "name": "BLS Occupation Flat", + "source": "BLS Occupation Flat", + "foreign_key": "bls_soc", + "annotations": {}, + "captions": {}, + "hierarchy_map": 
{} + } + }, + "measure_map": { + "Occupation Employment": { + "name": "Occupation Employment", + "key_column": "emp", + "aggregator": { + "type": "Average" + }, + "annotations": { + "aggregation_method": "NONE", + "units_of_measurement": "Employment", + "pre_aggregation_method": "SUM" + }, + "captions": {}, + "submeasures": {} + }, + "Occupation Employment Percent": { + "name": "Occupation Employment Percent", + "key_column": "emp_pct", + "aggregator": { + "type": "Average" + }, + "annotations": { + "aggregation_method": "NONE", + "units_of_measurement": "Percent", + "pre_aggregation_method": "Percent" + }, + "captions": {}, + "submeasures": {} + }, + "Occupation Employment Change": { + "name": "Occupation Employment Change", + "key_column": "emp_change", + "aggregator": { + "type": "Average" + }, + "annotations": { + "aggregation_method": "NONE", + "units_of_measurement": "Employment", + "pre_aggregation_method": "Change" + }, + "captions": {}, + "submeasures": {} + }, + "Occupation Employment Change Percent": { + "name": "Occupation Employment Change Percent", + "key_column": "emp_change_pct", + "aggregator": { + "type": "Average" + }, + "annotations": { + "aggregation_method": "NONE", + "units_of_measurement": "Percent", + "pre_aggregation_method": "Change Percent" + }, + "captions": {}, + "submeasures": {} + }, + "Occupation Employment Openings": { + "name": "Occupation Employment Openings", + "key_column": "occ_openings", + "aggregator": { + "type": "Average" + }, + "annotations": { + "aggregation_method": "NONE", + "units_of_measurement": "Openings", + "pre_aggregation_method": "SUM", + "details": "Projected Occupation Openings" + }, + "captions": {}, + "submeasures": {} + } + }, + "subset_table": false, + "visible": true + } + }, + "default_locale": "en", + "shared_dimension_map": { + "State Election": { + "name": "State Election", + "default_hierarchy": "State", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": null, + 
"hierarchy_map": { + "State": { + "name": "State", + "primary_key": "geoid", + "table": { + "name": "states_shapes2017", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "State": { + "name": "State", + "depth": 1, + "key_column": "geoid", + "annotations": { + "dim_type": "GEOGRAPHY" + }, + "captions": {}, + "name_column_map": { + "xx": "name" + }, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "State": { + "name": "State", + "default_hierarchy": "State", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": null, + "hierarchy_map": { + "State": { + "name": "State", + "primary_key": "state_id", + "table": { + "name": "dim_shared_state", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "State": { + "name": "State", + "depth": 1, + "key_column": "state_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "state" + }, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "PUMS Occupation": { + "name": "PUMS Occupation", + "default_hierarchy": "PUMS Occupation", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": null, + "hierarchy_map": { + "PUMS Occupation": { + "name": "PUMS Occupation", + "primary_key": "id", + "table": { + "name": "pums_dims_soc", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Major Occupation Group": { + "name": "Major Occupation Group", + "depth": 1, + "key_column": "great_grandparent", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "great_grandparent_name" + }, + "key_type": "i64", + "property_map": {} + }, + "Minor Occupation Group": { + "name": "Minor Occupation Group", + "depth": 2, + "key_column": "grandparent", + "annotations": {}, + "captions": {}, + "name_column_map": { 
+ "xx": "grandparent_name" + }, + "key_type": "i64", + "property_map": {} + }, + "Broad Occupation": { + "name": "Broad Occupation", + "depth": 3, + "key_column": "parent", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "parent_name" + }, + "key_type": "i64", + "property_map": {} + }, + "Detailed Occupation": { + "name": "Detailed Occupation", + "depth": 4, + "key_column": "id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "name" + }, + "key_type": "i64", + "property_map": {} + } + } + } + } + }, + "Skill Element": { + "name": "Skill Element", + "default_hierarchy": "Skill Element", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": null, + "hierarchy_map": { + "Skill Element": { + "name": "Skill Element", + "primary_key": "element_id", + "table": { + "name": "dim_skill", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Skill Element Group": { + "name": "Skill Element Group", + "depth": 1, + "key_column": "element_group_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "element_group_name" + }, + "key_type": "str", + "property_map": {} + }, + "Skill Element": { + "name": "Skill Element", + "depth": 2, + "key_column": "element_id", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "element_name" + }, + "key_type": "str", + "property_map": {} + } + } + } + } + }, + "BLS Occupation Flat": { + "name": "BLS Occupation Flat", + "default_hierarchy": "BLS Occupation Flat", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": null, + "hierarchy_map": { + "BLS Occupation Flat": { + "name": "BLS Occupation Flat", + "primary_key": "bls_code", + "table": { + "name": "dim_flat_bls_occupation", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Occupation": { + "name": 
"Occupation", + "depth": 1, + "key_column": "bls_code", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "name" + }, + "key_type": "i64", + "property_map": {} + } + } + } + } + }, + "CIP": { + "name": "CIP", + "default_hierarchy": "CIP", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": null, + "hierarchy_map": { + "CIP": { + "name": "CIP", + "primary_key": "cip", + "table": { + "name": "ipeds_dims_cip", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "CIP2": { + "name": "CIP2", + "depth": 1, + "key_column": "cip2", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "cip2_name" + }, + "key_type": "i64", + "property_map": { + "CIP2 Full Name": { + "name": "CIP2 Full Name", + "annotations": {}, + "captions": {}, + "key_column_map": { + "xx": "cip2_name_long" + }, + "key_type": "i64" + }, + "CIP2 Description": { + "name": "CIP2 Description", + "annotations": {}, + "captions": {}, + "key_column_map": { + "xx": "cip2_description" + }, + "key_type": "i64" + } + } + }, + "CIP4": { + "name": "CIP4", + "depth": 2, + "key_column": "cip4", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "cip4_name" + }, + "key_type": "i64", + "property_map": { + "CIP4 Full Name": { + "name": "CIP4 Full Name", + "annotations": {}, + "captions": {}, + "key_column_map": { + "xx": "cip4_name_long" + }, + "key_type": "i64" + }, + "CIP4 Description": { + "name": "CIP4 Description", + "annotations": {}, + "captions": {}, + "key_column_map": { + "xx": "cip4_description" + }, + "key_type": "i64" + } + } + }, + "CIP6": { + "name": "CIP6", + "depth": 3, + "key_column": "cip", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "cip_name" + }, + "key_type": "i64", + "property_map": { + "CIP6 Full Name": { + "name": "CIP6 Full Name", + "annotations": {}, + "captions": {}, + "key_column_map": { + "xx": 
"cip_name_long" + }, + "key_type": "i64" + }, + "CIP6 Description": { + "name": "CIP6 Description", + "annotations": {}, + "captions": {}, + "key_column_map": { + "xx": "cip_description" + }, + "key_type": "i64" + } + } + } + } + } + } + }, + "BLS Industry Flat": { + "name": "BLS Industry Flat", + "default_hierarchy": "BLS Industry Flat", + "annotations": {}, + "captions": {}, + "dim_type": "standard", + "foreign_key": null, + "hierarchy_map": { + "BLS Industry Flat": { + "name": "BLS Industry Flat", + "primary_key": "bls_code", + "table": { + "name": "dim_flat_bls_industry", + "primary_key": "id", + "schema": null + }, + "annotations": {}, + "captions": {}, + "default_member": null, + "level_map": { + "Industry": { + "name": "Industry", + "depth": 1, + "key_column": "bls_code", + "annotations": {}, + "captions": {}, + "name_column_map": { + "xx": "name" + }, + "key_type": "i64", + "property_map": {} + } + } + } + } + } + }, + "shared_table_map": {} +} \ No newline at end of file diff --git a/api/src/utils/api_data_request/api.py b/api/src/utils/api_data_request/api.py new file mode 100644 index 0000000..eec900e --- /dev/null +++ b/api/src/utils/api_data_request/api.py @@ -0,0 +1,164 @@ +import openai +import os +import pandas as pd +import requests +import json + +from os import getenv +from dotenv import load_dotenv +from src.utils.table_selection.table_details import * +from src.utils.preprocessors.text import * +from src.utils.api_data_request.similarity_search import * + +load_dotenv() + +# environment initialization +os.environ["TOKENIZERS_PARALLELISM"] = "false" + +# drilldown initialization +OPENAI_KEY = getenv("OPENAI_KEY") +openai.api_key = OPENAI_KEY + +TESSERACT_API = getenv("TESSERACT_API") +MONDRIAN_API = getenv('MONDRIAN_API') + +class ApiBuilder: + + def __init__(self, base_url): + self.base_url = base_url + self.cube = None + self.cuts = {} + self.drilldowns = set() + self.measures = set() + self.limit = None + self.sort = None + self.locale = None 
+ + def add_cube(self, cube): + self.cube = cube + + def add_cut(self, key, value): + if key not in self.cuts: + self.cuts[key] = set() + self.cuts[key].add(str(value)) + + def add_drilldown(self, drilldown): + if isinstance(drilldown, list): + for var in drilldown: + self.drilldowns.add(var) + else: + self.drilldowns.add(drilldown) + + def add_measure(self, measure): + if isinstance(measure, list): + self.measures.update(measure) + else: + self.measures.add(measure) + + def set_limit(self, limit): + self.limit = limit + + def set_sort(self, drilldown, order): + self.sort = f"{drilldown}.{order}" + + def set_locale(self, locale): + self.locale = locale + + def build_url(self): + query_params = [] + + if self.cube: + query_params.append(f"cube={self.cube}") + for key, values in self.cuts.items(): + query_params.append(f"{key}={','.join(values)}") + if self.drilldowns: + query_params.append("drilldowns=" + ",".join(self.drilldowns)) + if self.measures: + query_params.append("measures=" + ",".join(self.measures)) + if self.limit is not None: + query_params.append(f"limit={self.limit}") + if self.sort: + query_params.append(f"sort={self.sort}") + if self.locale: + query_params.append(f"locale={self.locale}") + + query_string = "&".join(query_params) + return f"{self.base_url}{query_string}" if query_params else self.base_url + + def fetch_data(self): + """ + Makes an API request to the constructed URL and returns the JSON data and a DataFrame. + """ + try: + r = requests.get(self.build_url()) + r.raise_for_status() + + if 'data' in r.json(): + json_data = r.json()['data'] + df = pd.DataFrame.from_dict(json_data) + return json_data, df, "" + else: + return {}, pd.DataFrame(), "No data key in response." 
+ + except Exception as e: + return {}, pd.DataFrame(), f"An error occurred: {str(e)}" + + def __str__(self): + return self.build_url() + + +def cuts_processing(cuts, table, table_manager, api): + + for i in range(len(cuts)): + var = cuts[i].split('=')[0].strip() + cut = cuts[i].split('=')[1].strip() + + var_levels = get_drilldown_levels(table_manager, table.name, var) + + if var == "Year" or var == "Month" or var == "Quarter" or var == "Month and Year" or var == "Time": + api.add_cut(var, cut) + else: + drilldown_id, drilldown_name, s = get_similar_content(cut, table.name, var_levels) + + if drilldown_name != var: + api.drilldowns.discard(var) + api.add_drilldown(drilldown_name) + + api.add_cut(drilldown_name, drilldown_id) + + +def api_build(table, table_manager, drilldowns, measures, cuts, limit = ""): + """ + Receives the the drilldowns, measures and filters obtained from the LLM. + """ + base = "Tesseract" + #table.api + + if base == "Mondrian": base = MONDRIAN_API + else: base = TESSERACT_API + "data.jsonrecords?" + + api = ApiBuilder(base) + + api.add_cube(table.name) + api.add_drilldown(drilldowns) + api.add_measure(measures) + + cuts_processing(cuts, table, table_manager, api) + + return api + + +def api_request(url): + try: + r = requests.get(url) + r.raise_for_status() + + if 'data' in r.json(): + json_data = r.json()['data'] + df = pd.DataFrame.from_dict(r.json()['data']) + return json_data, df, "" + + except: + json_data = json.loads('{}') + df = pd.DataFrame() + return json_data, df, "No data found." 
\ No newline at end of file diff --git a/api/src/utils/api_data_request/api_generator.py b/api/src/utils/api_data_request/api_generator.py index 05af322..0c5aeeb 100644 --- a/api/src/utils/api_data_request/api_generator.py +++ b/api/src/utils/api_data_request/api_generator.py @@ -10,6 +10,7 @@ from src.utils.table_selection.table_details import * from src.utils.preprocessors.text import * from src.utils.api_data_request.similarity_search import * +from src.utils.api_data_request.api import * load_dotenv() @@ -24,53 +25,70 @@ MONDRIAN_API = getenv('MONDRIAN_API') -def get_api_components_messages(table): +def get_api_components_messages(table, model_author, natural_language_query = ""): response_part = """ - {{ + { "variables": "", "measures": "", "filters": "" - }} + } """ - message = f""" - You are an expert data scientist working with data organized in a multidimensional format, such as in OLAP cubes. - You are given the following JSON containing the information of a cube that contains data to answer a user's question. - ---------------------\n - {table.columns_description()} - ---------------------\n - - Your goal is to identify the variables, measures and filters needed in order to retrieve the data from the cube through an API. - You should respond in JSON format with your answer separated into the following fields:\n - - \"variables\" which is a list of strings that contain the variables.\n - \"measures\" which is a list of strings that contain the relevant measures.\n - \"filters\" which is a list of strings that contain the filters in the form of 'variable = filtered_value'.\n - - in your answer, written in markdown format, provide the following information:\n - - \n - - the markdown formatted like this:\n - ``` - {response_part} - ``` - Provide only the list of variables, measures and filters, and nothing else after.\n - A few rules to take into consideration:\n - - You cannot apply filters to different variables with the same parent dimension. 
Choose only one (the most relevant or most granular)\n - - Assume the latest year to be 2023.\n - - For cases where the query requires to filter by a certain range of years or months, please specify all of them separately. - """ + if(model_author == "openai"): + + message = f""" +You are an expert data scientist working with data organized in a multidimensional format, such as in OLAP cubes. +You are given the following JSON containing the dimensions and measures of a cube that contains data to answer a user's question. +---------------------\n +{table.columns_description()} +---------------------\n +Your goal is to identify the variables, measures and filters needed in order to retrieve the data from the cube through an API. +The variables available correspond to the values in the 'levels' key. +You should respond in JSON format with your answer separated into the following fields:\n + + \"variables\" which is a list of strings that contain the variables.\n + \"measures\" which is a list of strings that contain the relevant measures.\n + \"filters\" which is a list of strings that contain the filters in the form of 'variable = filtered_value'.\n + +in your answer, provide the markdown formatted like this:\n +``` +{response_part} +``` +Provide only the list of variables, measures and filters, and nothing else before or after.\n +A few rules to take into consideration:\n +- You cannot apply filters to different variables with the same parent dimension. Choose only one (the most relevant or most granular)\n +- For cases where the query requires to filter by a certain range of years or months, please specify all of them separately. +""" + + else: + + message = f""" + +Below you can find the metadata of the cube: +---------------------\n +{table.columns_description()} +---------------------\n + +A few rules to take into consideration:\n +- You cannot apply filters to different variables with the same parent dimension. 
Choose only one (the most relevant or most granular)\n +- For cases where the query requires to filter by a certain range of years or months, please specify all of them separately. + +This is my question: +{natural_language_query} +""" return message + def get_model_author(model): """ - Identify Model Author for Model requestes + Identify Model Author for Model requests """ - # List of posible nodels + # List of possible models models = { - "openai": ["gpt-3.5-turbo", "gpt-4","gpt-4-0125-preview", "gpt-4-1106-preview"], - "llama": ["llama2"] + "openai": ["gpt-3.5-turbo", "gpt-4", "gpt-4-0125-preview", "gpt-4-1106-preview"], + "llama": ["llama2", "mistral", "codellama", "mixtral", "api_params"] } if model in models.get("openai"): @@ -82,14 +100,15 @@ def get_model_author(model): return author + def get_api_params_from_lm(natural_language_query, table = None, model="gpt-4", top_matches=False): """ - Identify API parameters to retrieve the data + Identify API parameters to retrieve the data using OpenAI models or Llama. 
""" + start_time = time.time() model_author = get_model_author(model) - print('here', model, model_author) - content = get_api_components_messages(table) + content = get_api_components_messages(table, model_author, natural_language_query) # logic for openai models if model_author == "openai": @@ -109,9 +128,9 @@ def get_api_params_from_lm(natural_language_query, table = None, model="gpt-4", while attempts < max_attempts: try: response = openai.ChatCompletion.create( - model=model, - messages=messages, - temperature=0 + model = model, + messages = messages, + temperature = 0 ) except openai.error.Timeout as e: print(f"OpenAI API request timed out (attempt {attempts + 1}): {e}") @@ -127,6 +146,8 @@ def get_api_params_from_lm(natural_language_query, table = None, model="gpt-4", time.sleep(1) output_text = response['choices'][0]['message']['content'] + end_time = time.time() + print("Duration:", end_time - start_time, "seconds") print("\nChatGPT response:", output_text) params = extract_text_from_markdown_triple_backticks(output_text) print("\nParameters:", params) @@ -134,87 +155,29 @@ def get_api_params_from_lm(natural_language_query, table = None, model="gpt-4", variables = json.loads(params).get("variables") measures = json.loads(params).get("measures") cuts = json.loads(params).get("filters") + elif model_author == "llama": url = "https://caleuche-ollama.datawheel.us/api/generate" + print(content) payload = { - "model": "llama2", + "model": model, "prompt": content } response = requests.post(url, json=payload) - + end_time = time.time() + print("Duration:", end_time - start_time, "seconds") print(response.text) - else: - # logics: ask for model on the list, or use a default one - status = "bad status" - - return variables, measures, cuts - - -def cuts_processing(cuts, table, table_manager, drilldowns): - updated_cuts = {} - - for i in range(len(cuts)): - var = cuts[i].split('=')[0].strip() - cut = cuts[i].split('=')[1].strip() - - var_levels = 
get_drilldown_levels(table_manager, table.name, var) - if var == "Year" or var == "Month" or var == "Quarter" or var == "Month and Year": - if var in updated_cuts: - updated_cuts[var].append(cut) - else: - updated_cuts[var] = [cut] - else: - drilldown_id, drilldown_name, s = get_similar_content(cut, table.name, var_levels) - - if drilldown_name != var: - drilldowns.remove(var) - if drilldown_name not in drilldowns: - drilldowns.append(drilldown_name) + response = parse_response(response.text) + print(response) + params = extract_text_from_markdown_triple_backticks(response) + variables = json.loads(params).get("variables") + measures = json.loads(params).get("measures") + cuts = json.loads(params).get("filters") - if drilldown_name in updated_cuts: - updated_cuts[drilldown_name].append(drilldown_id) - else: - updated_cuts[drilldown_name] = [drilldown_id] - - api_params = '&' + '&'.join([f"{key}={','.join(values)}" for key, values in updated_cuts.items()]) - - return api_params, drilldowns - - -def api_build(table, table_manager, drilldowns, measures, cuts, limit = ""): - base = table.api - - for i in range(len(drilldowns)): - drilldowns[i] = clean_string(drilldowns[i]) - - for i in range(len(measures)): - measures[i] = clean_string(measures[i]) - - measures_str = "&measures=" + ','.join(measures) - cuts_str, drilldowns = cuts_processing(cuts, table, table_manager, drilldowns) - drilldowns_str = "&drilldowns=" + ','.join(drilldowns) - - if base == "Mondrian": base = MONDRIAN_API - else: base = TESSERACT_API + "data.jsonrecords?cube=" + table.name - - url = base + drilldowns_str + measures_str + cuts_str - - return url - - -def api_request(url): - try: - r = requests.get(url) - r.raise_for_status() - - if 'data' in r.json(): - json_data = r.json()['data'] - df = pd.DataFrame.from_dict(r.json()['data']) - return json_data, df, "" + else: + # logic: ask for model on the list, or use a default one + status = "bad status" - except: - json_data = json.loads('{}') - df = 
pd.DataFrame() - return json_data, df, "No data found." \ No newline at end of file + return variables, measures, cuts \ No newline at end of file diff --git a/api/src/utils/app.py b/api/src/utils/app.py index 9d8406b..c9ed3f5 100644 --- a/api/src/utils/app.py +++ b/api/src/utils/app.py @@ -10,27 +10,25 @@ def get_api(query, TABLES_PATH): start_time = time.time() manager = TableManager(TABLES_PATH) - table = request_tables_to_lm_from_db(query, manager) + variables, measures, cuts = get_api_params_from_lm(query, table, model = 'gpt-4') - variables, measures, cuts = get_api_params_from_lm(query, table, model = 'gpt-4-1106-preview') - - api_url = api_build(table, manager, variables, measures, cuts) - + api = api_build(table, manager, variables, measures, cuts) + api_url = api.build_url() print("API:", api_url) - data, df, response = api_request(api_url) + data, df, response = api.fetch_data() end_time = time.time() duration = end_time - start_time if (response == "No data found." or df.empty): - log_apicall(query, "", response, "", "", "", table, duration) - return api_url, data, response else: response = agent_answer(df, query) log_apicall(query, api_url, response, variables, measures, cuts, table, duration) - - return api_url, data, response \ No newline at end of file + return api_url, data, response + +TABLES_PATH = getenv('TABLES_PATH') +get_api('How much did the CPI of fresh fruits change between 2019 and 2021', TABLES_PATH) \ No newline at end of file diff --git a/api/src/utils/preprocessors/text.py b/api/src/utils/preprocessors/text.py index 9cbb4ba..6bbf2f1 100644 --- a/api/src/utils/preprocessors/text.py +++ b/api/src/utils/preprocessors/text.py @@ -1,3 +1,4 @@ +import json import re import regex @@ -61,4 +62,42 @@ def extract_text_from_markdown_triple_backticks(raw_str): json_content = match.group(0) return json_content else: - return "" \ No newline at end of file + return "" + + +def parse_to_json(concatenated_str): + """ + Parses a concatenated string 
containing multiple JSON objects into a list of parsed JSON dictionaries. + """ + json_strs = concatenated_str.split("\n") + parsed_json_list = [] + + for json_str in json_strs: + if json_str.strip(): + parsed_json_list.append(json.loads(json_str)) + + return parsed_json_list + + +def parse_response(json_data): + """ + Parses LLama response to a continuous string. + """ + json_data = parse_to_json(json_data) + parsed_response = "" + + for item in json_data: + if "response" in item: + parsed_response += item["response"] + + return parsed_response + + +def clean_api_url(input_string): + characters_to_remove = "\"'`;" + + cleaned_string = input_string + for char in characters_to_remove: + cleaned_string = cleaned_string.replace(char, '') + + return cleaned_string \ No newline at end of file diff --git a/api/src/utils/table_selection/table_details.py b/api/src/utils/table_selection/table_details.py index bccb58c..4e7d1f8 100644 --- a/api/src/utils/table_selection/table_details.py +++ b/api/src/utils/table_selection/table_details.py @@ -10,41 +10,52 @@ def __init__(self, table_data): self.api = table_data.get('api') self.description = table_data.get('description') self.measures = table_data.get('measures', []) - self.variables = table_data.get('variables', []) + self.dimensions = table_data.get('dimensions', []) def get_measures_description(self, measure_name=None): if measure_name: for measure in self.measures: if measure['name'] == measure_name: - return f"{measure['name']}: {measure.get('description', 'No description available')}" + return f"{measure['name']} ({measure.get('description', 'No description available')})\n" return f"No description available for measure: {measure_name}" - else: return [f"{measure['name']}: {measure.get('description', 'No description available')}" for measure in self.measures] - - def get_variables_description(self, variable_name=None): - if variable_name: - for variable in self.variables: - if variable['name'] == variable_name: - return 
f"{variable['name']}: {variable.get('description', 'No description available')}" - return f"No description available for variable: {variable_name}" + else: return [f"{measure['name']} ({measure.get('description', 'No description available')})\n" for measure in self.measures] + + def get_dimensions_description(self, dimension_name=None): + if dimension_name: + for dimension in self.dimensions: + if dimension['name'] == dimension_name: + return f"{dimension['name']} ({dimension.get('description', 'No description available')})\n" + return f"No description available for dimension: {dimension_name}" - else: return [f"{variable['name']}: {variable.get('description', 'No description available')}" for variable in self.variables] + else: return [f"{dimension['name']} ({dimension.get('description', 'No description available')})\n" for dimension in self.dimensions] + + def get_dimension_hierarchies(self, dimension_name): + for dimension in self.dimensions: + if dimension['name'] == dimension_name and "hierarchies" in dimension: + return dimension["hierarchies"][0]["levels"] + + for dimension in self.dimensions: + for hierarchy in dimension.get("hierarchies", []): + if hierarchy['name'] == dimension_name: + return hierarchy["levels"] + + for dimension in self.dimensions: + for hierarchy in dimension.get("hierarchies", []): + if dimension_name in hierarchy.get("levels", []): + return hierarchy["levels"] - def get_variable_hierarchies(self, variable_name): - for variable in self.variables: - if variable['name'] == variable_name and "hierarchies" in variable: - return variable["hierarchies"] return None def schema_description(self): - dimensions_str = ", ".join([f"{var['name']} ({var.get('description', 'No description')})" for var in self.variables]) + dimensions_str = ", ".join([f"{var['name']} ({var.get('description', 'No description')})" for var in self.dimensions]) measures_str = ", ".join([f"{measure['name']} ({measure.get('description', 'No description')})" for measure in 
self.measures]) return f"Table Name: {self.name}\nDescription: {self.description}\nDimensions: {dimensions_str}\nMeasures: {measures_str}\n" def columns_description(self): dimensions_str_list = [ - f"{dimension['name']} ({dimension.get('description', 'No description')}) [Parent dimension: {dimension.get('parent dimension', 'N/A')}];\n" - for dimension in self.variables + f"{dimension['name']} ({dimension.get('description', 'No description')}) [Levels: {dimension['hierarchies'][0]['levels']}];\n" + for dimension in self.dimensions ] measures_str_list = [ @@ -55,13 +66,13 @@ def columns_description(self): dimensions_str = ''.join(dimensions_str_list) measures_str = ''.join(measures_str_list) - columns_str = f"Table Name: {self.name}\n" + "Variables:\n" + dimensions_str + "Measures:\n" + measures_str + columns_str = f"Table Name: {self.name}\n" + "Dimensions:\n" + dimensions_str + "\nMeasures:\n" + measures_str return columns_str def __str__(self): measures_str = ", ".join(self.get_measures_description()) - variables_str = ", ".join(self.get_variables_description()) - return f"Table Name: {self.name}\nDescription: {self.description}\nMeasures: {measures_str}\nVariables: {variables_str}\n" + dimensions_str = ", ".join(self.get_dimensions_description()) + return f"Table Name: {self.name}\nDescription: {self.description}\nMeasures:\n {measures_str}\nDimensions:\n {dimensions_str}\n" class TableManager: @@ -101,9 +112,9 @@ def get_table_schemas(self, table_names: List[str] = None) -> str: return "\n\n".join(tables_str_list) -def get_drilldown_levels(manager, table_name, variable_name): +def get_drilldown_levels(manager, table_name, dimension_name): table = manager.get_table(table_name) if table: - return table.get_variable_hierarchies(variable_name) + return table.get_dimension_hierarchies(dimension_name) else: return None \ No newline at end of file From 3334910620f7b7b4f44f3bd24bdc6150bfed5870 Mon Sep 17 00:00:00 2001 From: Alexandra Date: Tue, 12 Mar 2024 12:56:34 
-0300 Subject: [PATCH 02/29] Remove unused function and add jupyter notebook --- api/api_class.ipynb | 117 ++++++++++++++++++++++++++ api/src/utils/api_data_request/api.py | 23 +---- 2 files changed, 120 insertions(+), 20 deletions(-) diff --git a/api/api_class.ipynb b/api/api_class.ipynb index e69de29..bb8814c 100644 --- a/api/api_class.ipynb +++ b/api/api_class.ipynb @@ -0,0 +1,117 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext dotenv\n", + "%dotenv\n", + "\n", + "from src.utils.api_data_request.api import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "jsondata = {\n", + " \"variables\": [\"Time\", \"Products or Services\"],\n", + " \"measures\": [\"Consumer Price Index\", \"Percent Change\"],\n", + " \"filters\": [\"Time = 2019\", \"Time = 2020\", \"Time = 2021\", \"Products or Services = Fresh Fruits\"]\n", + "}\n", + "\n", + "v = jsondata[\"variables\"]\n", + "m = jsondata[\"measures\"]\n", + "c = jsondata[\"filters\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from src.utils.table_selection.table_selector import *\n", + "from src.utils.table_selection.table_details import *\n", + "\n", + "TABLES_PATH = getenv('TABLES_PATH')\n", + "manager = TableManager(TABLES_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "table = manager.get_table(\"Consumer Price Index - CPI\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "https://api-dev.datausa.io/tesseract/data.jsonrecords?cube=Consumer Price Index - CPI&Time=2021,2019,2020&Level 5.5=501010407009&drilldowns=Level 5.5,Time&measures=Consumer Price Index,Percent Change\n" + ] + }, + { + "data": { + "text/plain": [ + 
"'https://api-dev.datausa.io/tesseract/data.jsonrecords?cube=Consumer Price Index - CPI&Time=2021,2019,2020&Level 5.5=501010407009&drilldowns=Level 5.5,Time&measures=Consumer Price Index,Percent Change'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "api_build(table, manager, v, m, c)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ai-env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/api/src/utils/api_data_request/api.py b/api/src/utils/api_data_request/api.py index eec900e..d44d8a2 100644 --- a/api/src/utils/api_data_request/api.py +++ b/api/src/utils/api_data_request/api.py @@ -129,10 +129,9 @@ def cuts_processing(cuts, table, table_manager, api): def api_build(table, table_manager, drilldowns, measures, cuts, limit = ""): """ - Receives the the drilldowns, measures and filters obtained from the LLM. + Receives the drilldowns, measures and filters obtained from the LLM and adds them as attributes to the api instance. """ - base = "Tesseract" - #table.api + base = table.api if base == "Mondrian": base = MONDRIAN_API else: base = TESSERACT_API + "data.jsonrecords?" 
@@ -145,20 +144,4 @@ def api_build(table, table_manager, drilldowns, measures, cuts, limit = ""): cuts_processing(cuts, table, table_manager, api) - return api - - -def api_request(url): - try: - r = requests.get(url) - r.raise_for_status() - - if 'data' in r.json(): - json_data = r.json()['data'] - df = pd.DataFrame.from_dict(r.json()['data']) - return json_data, df, "" - - except: - json_data = json.loads('{}') - df = pd.DataFrame() - return json_data, df, "No data found." \ No newline at end of file + return api \ No newline at end of file From dd8a8a1417a98aeb6b1b03b9d4f7f92a1e354f28 Mon Sep 17 00:00:00 2001 From: Alexandra Date: Tue, 12 Mar 2024 16:51:18 -0300 Subject: [PATCH 03/29] Add script to map tesseract schema to custom json --- api/src/utils/{ => helpers}/cubes_to_db.py | 0 .../utils/{ => helpers}/drilldowns_to_db.py | 0 .../utils/helpers}/tesseract_schema.json | 0 .../utils/helpers/tesseract_schema_mapping.py | 72 +++++++++++++++++++ 4 files changed, 72 insertions(+) rename api/src/utils/{ => helpers}/cubes_to_db.py (100%) rename api/src/utils/{ => helpers}/drilldowns_to_db.py (100%) rename api/{data => src/utils/helpers}/tesseract_schema.json (100%) create mode 100644 api/src/utils/helpers/tesseract_schema_mapping.py diff --git a/api/src/utils/cubes_to_db.py b/api/src/utils/helpers/cubes_to_db.py similarity index 100% rename from api/src/utils/cubes_to_db.py rename to api/src/utils/helpers/cubes_to_db.py diff --git a/api/src/utils/drilldowns_to_db.py b/api/src/utils/helpers/drilldowns_to_db.py similarity index 100% rename from api/src/utils/drilldowns_to_db.py rename to api/src/utils/helpers/drilldowns_to_db.py diff --git a/api/data/tesseract_schema.json b/api/src/utils/helpers/tesseract_schema.json similarity index 100% rename from api/data/tesseract_schema.json rename to api/src/utils/helpers/tesseract_schema.json diff --git a/api/src/utils/helpers/tesseract_schema_mapping.py b/api/src/utils/helpers/tesseract_schema_mapping.py new file mode 100644 
index 0000000..fb3aefe --- /dev/null +++ b/api/src/utils/helpers/tesseract_schema_mapping.py @@ -0,0 +1,72 @@ +import json +import sys + +def tesseract_schema_mapping(input_file, output_file): + + with open(input_file, 'r') as f: + input_json = json.load(f) + + tables = [] + + cube_map = input_json.get("cube_map", {}) + for cube_name, cube_data in cube_map.items(): + table_data = cube_data.get("table", {}) + dimensions_data = cube_data.get("dimension_map", {}) + measures_data = cube_data.get("measure_map", {}) + + table = { + "name": cube_name, + "api": "Tesseract", + "description": f"Table `{cube_name}` has data on {', '.join(measures_data.keys())}.", + "measures": [], + "dimensions": [] + } + + for measure_name, measure_data in measures_data.items(): + measure = { + "name": measure_name, + "description": f"Contains the {measure_name.lower()} for {cube_name.replace('_', ' ')}" + } + table["measures"].append(measure) + + for dimension_name, dimension_data in dimensions_data.items(): + dimension = { + "name": dimension_name, + "description": f"{dimension_name.lower()} dimension of the data.", + "hierarchies": [] + } + + hierarchy_map = dimension_data.get("hierarchy_map", {}) + for hierarchy_name, hierarchy_data in hierarchy_map.items(): + levels = [] + level_map = hierarchy_data.get("level_map", {}) + for level_name, level_data in level_map.items(): + levels.append(level_name) + + hierarchy = { + "name": hierarchy_name, + "levels": levels + } + dimension["hierarchies"].append(hierarchy) + + table["dimensions"].append(dimension) + + tables.append(table) + + output_json = {"tables": tables} + + with open(output_file, 'w') as f: + json.dump(output_json, f, indent=4) + + return None + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: python tesseract_schema_mapping.py ") + sys.exit(1) + + input_file = sys.argv[1] + output_file = sys.argv[2] + + tesseract_schema_mapping(input_file, output_file) \ No newline at end of file From 
8cabb49bebb22eb9b07442ff27e488bcbf4c1a0a Mon Sep 17 00:00:00 2001 From: Alexandra Date: Tue, 12 Mar 2024 18:00:21 -0300 Subject: [PATCH 04/29] Update README --- README.md | 67 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index b534f96..756f27c 100644 --- a/README.md +++ b/README.md @@ -9,22 +9,25 @@ This repository contains scripts for a chatbot that leverages artificial intelli - Also contains `tables.json` which contains available cubes, with their descriptions, column names, and relevant details. -### 2. **`utils/`** +### 2. **`src/utils/`** - Houses all the main scripts to run the chatbot. - **Subfolders:** 1. **`api_data_request/`** - Core scripts responsible for constructing the API URL. Contains functions for processing query cuts and matching values with their respective IDs. - - 2. **`preprocessors/`** - - Contains scripts that preprocess text (or any other data type as needed). - - 3. **`table_selection/`** - - All scripts needed to lookup the relevant table/cube that contains the data needed to answer a user's query. - 4. **`data_analysis/`** + 2. **`data_analysis/`** - Contains scripts used for data analysis (mainly using [LangChain](https://python.langchain.com/docs/get_started/introduction)). + 3. **`helpers/`** + - Stores scripts to ingest cubes and drilldowns into a database. Also contains a script to map the tesseract schema to the custom `tables.json` format needed to run the chat. + + 4. **`preprocessors/`** + - Contains scripts that preprocess text (or any other data type as needed). + + 5. **`table_selection/`** + - All scripts needed to lookup and manage the relevant cube that contains the data needed to answer the user's query. 
+ ## General Workflow @@ -41,6 +44,9 @@ This repository contains scripts for a chatbot that leverages artificial intelli - **Option 3: request_tables_to_lm_from_db()** - Hybrid approach that obtains the top N matches from the database using embeddings. It then asks the LM to choose between these N tables. + - **Option 4: [in progress]** + - Will receive the table name from the wrapper. + 2. All the above functions return the name of the most relevant table. The app currenty works with Option 3. ### 2. API URL Generator & Data Request @@ -60,11 +66,13 @@ This repository contains scripts for a chatbot that leverages artificial intelli 3. Extracts the JSON from the LM's output string. - 4. For the cuts, a similarity search is done over the corresponding dimension members to extract their ids. + 4. Instantiates an ApiBuilder object and sets the variables, measures, and cuts provided by the LLM as attributes using the class methods. + + 4. For the cuts, a similarity search is done over the corresponding dimension members of the cube to extract their ids from the database (with the `cuts_processing()` function). - 5. The API URL (for Mondrian or Tesseract) is built using the processed cuts, drilldowns and measures obtained from previous steps. + 5. The API URL (for Mondrian or Tesseract) is built using the processed cuts, drilldowns and measures obtained from previous steps by running the `build_url()` method. - 6. The data is retrieved from the API and stored in a pandas dataframe. + 6. The data is retrieved from the API using the `fetch_data()` method and stored in a pandas dataframe. ### 3. Data Analysis/Processing @@ -81,7 +89,7 @@ Currently, the cubes available to be queried by the chatbot are: - Data_USA_House_election - [in progress] pums_5 -In order to add cubes, the steps are: +In order to add one cube, the steps are: 1. Add the cube to the tables.json file. 
The following fields must be filled: - name @@ -94,27 +102,46 @@ In order to add cubes, the steps are: "description": "value in millions of dollars of a certain shipment." } ``` - - variables - - Add each level separately, filling the following fields for each: + - dimensions + - Add each hierarchy separately, filling the following fields for each: ```json { - "name": "State", - "description": "US states", - "parent dimension": "Geography", - "hierarchies": ["State", "County"] + "name": "Time", + "description": "Periodicity of the data (monthly or annual).", + "hierarchies": [ + { + "name": "Month and Year", + "description": "'Month and Year' has the format YYYYMM (example March of 2015 is 201503)", + "levels": [ + "Year", + "Month and Year" + ] + } + ] } ``` - 2. Add the cube to the database (datausa_tables.cubes), filling the following columns (you can use the `cubes_to_db.py` script): + 2. Add the cube to the database (**datausa_tables.cubes**), filling the following columns (you can use the `cubes_to_db.py` script): - table_name - table_description - embedding (embedding of the table's description is represented as a 384-dimensional vector, derived using the `SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')` model) - 3. Add drilldown members & ids to the db (datausa_drilldowns.drilldowns) + 3. Add drilldown members & ids to the db (**datausa_drilldowns.drilldowns**) - This process can be initiated by executing the `drilldowns_to_db.py` script. During execution, the code will prompt for the API URL to fetch the drilldown members and IDs. Then, it will request the measure name in order to remove it from the dataframe before loading the data to the database. - The script then appends a column containing embeddings generated from the drilldown names using the same embedding model mentioned before. - This process needs to be repeated for each drilldown level within the cube or those required for making cuts. Time variables don't need to be loaded into the database. 
+### [For future projects] In progress... + +To add all the cubes of a project automatically, they can be mapped from the tesseract schema json to the custom format needed in the app. To do this follow these steps: + + 1. Retrieve the tesseract schema json (for example [this one](https://api-dev.datausa.io/tesseract/debug/schema)) and store it in the **`/helpers`** folder. + + 2. Run the following command in the terminal (replacing the file names): + ``` + python tesseract_schema_mapping.py + ``` + # API beeing served by FastAPI From 9bacf9a407e55f0ba5c8b05c75c7f6bdf8a821e4 Mon Sep 17 00:00:00 2001 From: Alexandra Date: Tue, 12 Mar 2024 18:02:50 -0300 Subject: [PATCH 05/29] Update README --- README.md | 69 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index b534f96..0c8eeeb 100644 --- a/README.md +++ b/README.md @@ -9,22 +9,25 @@ This repository contains scripts for a chatbot that leverages artificial intelli - Also contains `tables.json` which contains available cubes, with their descriptions, column names, and relevant details. -### 2. **`utils/`** +### 2. **`src/utils/`** - Houses all the main scripts to run the chatbot. - **Subfolders:** 1. **`api_data_request/`** - Core scripts responsible for constructing the API URL. Contains functions for processing query cuts and matching values with their respective IDs. - - 2. **`preprocessors/`** - - Contains scripts that preprocess text (or any other data type as needed). - - 3. **`table_selection/`** - - All scripts needed to lookup the relevant table/cube that contains the data needed to answer a user's query. - 4. **`data_analysis/`** + 2. **`data_analysis/`** - Contains scripts used for data analysis (mainly using [LangChain](https://python.langchain.com/docs/get_started/introduction)). + 3. **`helpers/`** + - Stores scripts to ingest cubes and drilldowns into a database. 
Also contains a script to map the tesseract schema to the custom `tables.json` format needed to run the chat. + + 4. **`preprocessors/`** + - Contains scripts that preprocess text (or any other data type as needed). + + 5. **`table_selection/`** + - All scripts needed to lookup and manage the relevant cube that contains the data needed to answer the user's query. + ## General Workflow @@ -41,6 +44,9 @@ This repository contains scripts for a chatbot that leverages artificial intelli - **Option 3: request_tables_to_lm_from_db()** - Hybrid approach that obtains the top N matches from the database using embeddings. It then asks the LM to choose between these N tables. + - **Option 4: [in progress]** + - Will receive the table name from the wrapper. + 2. All the above functions return the name of the most relevant table. The app currenty works with Option 3. ### 2. API URL Generator & Data Request @@ -60,11 +66,13 @@ This repository contains scripts for a chatbot that leverages artificial intelli 3. Extracts the JSON from the LM's output string. - 4. For the cuts, a similarity search is done over the corresponding dimension members to extract their ids. + 4. Instantiates an ApiBuilder object and sets the variables, measures, and cuts provided by the LLM as attributes using the class methods. + + 4. For the cuts, a similarity search is done over the corresponding dimension members of the cube to extract their ids from the database (with the `cuts_processing()` function). - 5. The API URL (for Mondrian or Tesseract) is built using the processed cuts, drilldowns and measures obtained from previous steps. + 5. The API URL (for Mondrian or Tesseract) is built using the processed cuts, drilldowns and measures obtained from previous steps by running the `build_url()` method. - 6. The data is retrieved from the API and stored in a pandas dataframe. + 6. The data is retrieved from the API using the `fetch_data()` method and stored in a pandas dataframe. ### 3. 
Data Analysis/Processing @@ -81,9 +89,9 @@ Currently, the cubes available to be queried by the chatbot are: - Data_USA_House_election - [in progress] pums_5 -In order to add cubes, the steps are: +In order to add one cube, the steps are: - 1. Add the cube to the tables.json file. The following fields must be filled: + 1. Add the cube to the `tables.json` file. The following fields must be filled: - name - api (Tesseract or Mondrian) - description @@ -94,27 +102,46 @@ In order to add cubes, the steps are: "description": "value in millions of dollars of a certain shipment." } ``` - - variables - - Add each level separately, filling the following fields for each: + - dimensions + - Add each hierarchy separately, filling the following fields for each: ```json { - "name": "State", - "description": "US states", - "parent dimension": "Geography", - "hierarchies": ["State", "County"] + "name": "Time", + "description": "Periodicity of the data (monthly or annual).", + "hierarchies": [ + { + "name": "Month and Year", + "description": "'Month and Year' has the format YYYYMM (example March of 2015 is 201503)", + "levels": [ + "Year", + "Month and Year" + ] + } + ] } ``` - 2. Add the cube to the database (datausa_tables.cubes), filling the following columns (you can use the `cubes_to_db.py` script): + 2. Add the cube to the database (**datausa_tables.cubes**), filling the following columns (you can use the `cubes_to_db.py` script): - table_name - table_description - embedding (embedding of the table's description is represented as a 384-dimensional vector, derived using the `SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')` model) - 3. Add drilldown members & ids to the db (datausa_drilldowns.drilldowns) + 3. Add drilldown members & ids to the db (**datausa_drilldowns.drilldowns**) - This process can be initiated by executing the `drilldowns_to_db.py` script. During execution, the code will prompt for the API URL to fetch the drilldown members and IDs. 
Then, it will request the measure name in order to remove it from the dataframe before loading the data to the database. - The script then appends a column containing embeddings generated from the drilldown names using the same embedding model mentioned before. - This process needs to be repeated for each drilldown level within the cube or those required for making cuts. Time variables don't need to be loaded into the database. +### [For future projects] In progress... + +To add all the cubes of a project automatically, they can be mapped from the tesseract schema json to the custom format needed in the app. To do this, follow these steps: + + 1. Retrieve the tesseract schema json (for example [this one](https://api-dev.datausa.io/tesseract/debug/schema)) and store it in the **`/helpers`** folder. + + 2. Run the following command in the terminal (replacing the file names): + ``` + python tesseract_schema_mapping.py + ``` + # API being served by FastAPI From f4b9fcd9669929b60d9369996687671576656899 Mon Sep 17 00:00:00 2001 From: Alexandra Date: Tue, 12 Mar 2024 18:09:52 -0300 Subject: [PATCH 06/29] Update README --- README.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 0c8eeeb..99bb3c7 100644 --- a/README.md +++ b/README.md @@ -95,13 +95,14 @@ In order to add one cube, the steps are: - name - api (Tesseract or Mondrian) - description - - measures - ```json - { - "name": "Millions Of Dollars", - "description": "value in millions of dollars of a certain shipment." 
- } - ``` + - measures: + ```json + { + "name": "Millions Of Dollars", + "description": "value in millions of dollars of a shipment" + } + ``` + - dimensions - Add each hierarchy separately, filling the following fields for each: ```json From e85829c3a4cf74387bf57838b434d446ed911365 Mon Sep 17 00:00:00 2001 From: nspmx Date: Thu, 14 Mar 2024 10:17:48 -0300 Subject: [PATCH 07/29] add postgres engine and clean config.py --- api/src/config.py | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/api/src/config.py b/api/src/config.py index bdba90f..8d873af 100644 --- a/api/src/config.py +++ b/api/src/config.py @@ -1,37 +1,34 @@ import openai - from os import getenv from dotenv import load_dotenv from sqlalchemy import create_engine +# Load .env file if exists load_dotenv() -TESSERACT_URL = getenv("TESSERACT_URL") -OPENAI_KEY = getenv("OPENAI_KEY") - -POSTGRES_DB = getenv("POSTGRES_DB") -POSTGRES_URL = getenv("POSTGRES_URL") -POSTGRES_PASSWORD = getenv("POSTGRES_PASSWORD") +# PostgreSQL Connection POSTGRES_USER = getenv("POSTGRES_USER") +POSTGRES_PASSWORD = getenv("POSTGRES_PASSWORD") +POSTGRES_HOST = getenv("POSTGRES_HOST") +POSTGRES_DB = getenv("POSTGRES_DB") +POSTGRES_PORT = 5432 -EVENTS_DB = getenv("EVENTS_DB") -EVENTS_URL = getenv("EVENTS_URL") -EVENTS_PASSWORD = getenv("EVENTS_PASSWORD") -EVENTS_USER = getenv("EVENTS_USER") +if POSTGRES_HOST: + POSTGRES_ENGINE = create_engine('postgresql+psycopg2://{}:{}@{}:{}/{}'.format(POSTGRES_USER,POSTGRES_PASSWORD,POSTGRES_HOST,POSTGRES_PORT,POSTGRES_DB)) +else: + print('POSTGRES_HOST not found, please check your environment') + exit(1) + +# OpenAI Connection +OPENAI_KEY = getenv("OPENAI_KEY") openai.api_key = OPENAI_KEY -if POSTGRES_URL: - ENGINE = create_engine('postgresql+psycopg2://{}:{}@{}:5432/{}'.format(POSTGRES_USER,POSTGRES_PASSWORD,POSTGRES_URL,POSTGRES_DB)) - dialect_mapping = { - "postgresql": "PostgreSQL 14", - } - DIALECT = 
dialect_mapping.get(ENGINE.dialect.name) -else: - print('POSTGRES_URL not found, please check your environment') +if not OPENAI_KEY: + print('OPENAI_KEY not found, please check your environment') exit(1) -if EVENTS_URL: - EVENTS_ENGINE = create_engine(EVENTS_URL) -else: - EVENTS_ENGINE = None +# Tesseract Connection +TESSERACT_API = getenv("TESSERACT_API") + +print('here in config: {}'.format(POSTGRES_ENGINE.connect())) \ No newline at end of file From 250b9748080d688bfc9e4f7d76818f5a2a3b7715 Mon Sep 17 00:00:00 2001 From: nspmx Date: Thu, 14 Mar 2024 10:18:20 -0300 Subject: [PATCH 08/29] replace postgres engine to config.py on required scripts --- .../api_data_request/similarity_search.py | 21 ++----------------- api/src/utils/helpers/cubes_to_db.py | 16 +++----------- api/src/utils/helpers/drilldowns_to_db.py | 16 +++----------- .../table_selection/table_database_search.py | 16 ++------------ 4 files changed, 10 insertions(+), 59 deletions(-) diff --git a/api/src/utils/api_data_request/similarity_search.py b/api/src/utils/api_data_request/similarity_search.py index f3160b4..67f8639 100644 --- a/api/src/utils/api_data_request/similarity_search.py +++ b/api/src/utils/api_data_request/similarity_search.py @@ -1,30 +1,13 @@ -import os import pandas as pd -from sqlalchemy import create_engine +from src.config import POSTGRES_ENGINE from sentence_transformers import SentenceTransformer -POSTGRES_USERNAME = os.getenv('POSTGRES_USER') -POSTGRES_PASSWORD = os.getenv('POSTGRES_PASSWORD') -POSTGRES_URL = os.getenv('POSTGRES_URL') -POSTGRES_DATABASE = os.getenv('POSTGRES_DB') - - def get_similar_content(text, cube_name, drilldown_names, threshold=0, content_limit=1, embedding_model='multi-qa-MiniLM-L6-cos-v1', verbose=False): """ Receives a string, computes its embedding, and then looks for similar content in a database based on the given cube and drilldown levels. Returns top match, similarity score, and others depending on the drilldown. 
""" - - POSTGRES_USERNAME = os.getenv('POSTGRES_USER') - POSTGRES_PASSWORD = os.getenv('POSTGRES_PASSWORD') - POSTGRES_URL = os.getenv('POSTGRES_URL') - POSTGRES_DATABASE = os.getenv('POSTGRES_DB') - - engine = create_engine( - 'postgresql+psycopg2://{}:{}@{}:5432/{}'.format(POSTGRES_USERNAME, POSTGRES_PASSWORD, POSTGRES_URL, - POSTGRES_DATABASE)) - model = SentenceTransformer(embedding_model) # 384 embedding = model.encode([text]) @@ -34,7 +17,7 @@ def get_similar_content(text, cube_name, drilldown_names, threshold=0, content_l if verbose: print(query) - df = pd.read_sql(query,con=engine) + df = pd.read_sql(query,con=POSTGRES_ENGINE) if verbose: print(df) diff --git a/api/src/utils/helpers/cubes_to_db.py b/api/src/utils/helpers/cubes_to_db.py index b4eee49..c66efb7 100644 --- a/api/src/utils/helpers/cubes_to_db.py +++ b/api/src/utils/helpers/cubes_to_db.py @@ -1,18 +1,8 @@ -import os import pandas as pd -from sqlalchemy import create_engine +from src.config import POSTGRES_ENGINE from sentence_transformers import SentenceTransformer - -POSTGRES_USERNAME = os.getenv('POSTGRES_USER') -POSTGRES_PASSWORD = os.getenv('POSTGRES_PASSWORD') -POSTGRES_URL = os.getenv('POSTGRES_URL') -POSTGRES_DATABASE = os.getenv('POSTGRES_DB') - -engine = create_engine('postgresql+psycopg2://{}:{}@{}:5432/{}'.format(POSTGRES_USERNAME,POSTGRES_PASSWORD,POSTGRES_URL,POSTGRES_DATABASE)) -conn = engine.connect() - def embedding(dataframe, column): """ Creates embeddings for text in the passed column @@ -26,7 +16,7 @@ def embedding(dataframe, column): def create_table(): - engine.execute("CREATE TABLE IF NOT EXISTS datausa_tables.cubes (table_name text, table_description text, embedding vector(384))") + POSTGRES_ENGINE.execute("CREATE TABLE IF NOT EXISTS datausa_tables.cubes (table_name text, table_description text, embedding vector(384))") return @@ -35,7 +25,7 @@ def load_data_to_db(df): print(df.head()) df_embeddings = embedding(df, 'table_description') - df_embeddings.to_sql('cubes', conn, 
if_exists='append', index=False, schema='datausa_tables') + df_embeddings.to_sql('cubes', con=POSTGRES_ENGINE, if_exists='append', index=False, schema='datausa_tables') return diff --git a/api/src/utils/helpers/drilldowns_to_db.py b/api/src/utils/helpers/drilldowns_to_db.py index fde1130..825b8d9 100644 --- a/api/src/utils/helpers/drilldowns_to_db.py +++ b/api/src/utils/helpers/drilldowns_to_db.py @@ -1,20 +1,10 @@ -import os import pandas as pd import requests import urllib.parse -from sqlalchemy import create_engine +from src.config import POSTGRES_ENGINE from sentence_transformers import SentenceTransformer - -POSTGRES_USERNAME = os.getenv('POSTGRES_USER') -POSTGRES_PASSWORD = os.getenv('POSTGRES_PASSWORD') -POSTGRES_URL = os.getenv('POSTGRES_URL') -POSTGRES_DATABASE = os.getenv('POSTGRES_DB') - -engine = create_engine('postgresql+psycopg2://{}:{}@{}:5432/{}'.format(POSTGRES_USERNAME,POSTGRES_PASSWORD,POSTGRES_URL,POSTGRES_DATABASE)) -conn = engine.connect() - def embedding(dataframe, column): """ Creates embeddings for text in the passed column @@ -28,7 +18,7 @@ def embedding(dataframe, column): def create_table(): - engine.execute("CREATE TABLE IF NOT EXISTS datausa_drilldowns.drilldowns (product_id text, product_name text, cube_name text, drilldown text, embedding vector(384))") + POSTGRES_ENGINE.execute("CREATE TABLE IF NOT EXISTS datausa_drilldowns.drilldowns (product_id text, product_name text, cube_name text, drilldown text, embedding vector(384))") return @@ -67,7 +57,7 @@ def load_data_to_db(api_url, measure_name): print(df.head()) df_embeddings = embedding(df, 'product_name') - df_embeddings.to_sql('drilldowns', conn, if_exists='append', index=False, schema='datausa_drilldowns') + df_embeddings.to_sql('drilldowns', con=POSTGRES_ENGINE, if_exists='append', index=False, schema='datausa_drilldowns') return diff --git a/api/src/utils/table_selection/table_database_search.py b/api/src/utils/table_selection/table_database_search.py index 7ccdcca..9034c66 
100644 --- a/api/src/utils/table_selection/table_database_search.py +++ b/api/src/utils/table_selection/table_database_search.py @@ -1,28 +1,16 @@ -import os import pandas as pd +from src.config import POSTGRES_ENGINE from typing import List -from sqlalchemy import create_engine - def get_similar_tables(vector, threshold=0, content_limit=1) -> List[str]: """ Receives a string, computes its embedding and then looks for similar content in a database. Returns top match, similarity score, and others depending on the drilldown. """ - - # Postgres - POSTGRES_USERNAME = os.getenv('POSTGRES_USER') - POSTGRES_PASSWORD = os.getenv('POSTGRES_PASSWORD') - POSTGRES_URL = os.getenv('POSTGRES_URL') - POSTGRES_DATABASE = os.getenv('POSTGRES_DB') - - engine = create_engine('postgresql+psycopg2://{}:{}@{}:5432/{}'.format(POSTGRES_USERNAME,POSTGRES_PASSWORD,POSTGRES_URL,POSTGRES_DATABASE)) - - query = """select table_name, similarity from "match_table"('{}','{}' ,'{}'); """.format(vector[0].tolist().__str__(), str(threshold), str(content_limit)) - df = pd.read_sql(query, con=engine) + df = pd.read_sql(query, con=POSTGRES_ENGINE) tables = df['table_name'].tolist() return tables \ No newline at end of file From 718fcb5991f396d41c736b8013e7ab8f57b282ef Mon Sep 17 00:00:00 2001 From: nspmx Date: Thu, 14 Mar 2024 10:36:18 -0300 Subject: [PATCH 09/29] replace postgres engine to config.py on required scripts --- api/src/utils/logs.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/api/src/utils/logs.py b/api/src/utils/logs.py index b4ba6e1..9a6cfa2 100644 --- a/api/src/utils/logs.py +++ b/api/src/utils/logs.py @@ -3,14 +3,9 @@ import time from datetime import datetime -from os import getenv from sqlalchemy import text -from sqlalchemy import create_engine -POSTGRES_USERNAME = getenv('POSTGRES_USER') -POSTGRES_PASSWORD = getenv('POSTGRES_PASSWORD') -POSTGRES_URL = getenv('POSTGRES_URL') -POSTGRES_DATABASE = getenv('POSTGRES_DB') +from src.config import 
POSTGRES_ENGINE def generate_custom_id(): timestamp = str(int(time.time())) @@ -40,9 +35,7 @@ def log_apicall(query, api_url, response, drilldowns, measures, cuts, cube, dura VALUES (:query_id, :question, :api_url, :response, :created_on, :drilldowns, :measures, :cuts, :cube, :duration) """) - engine = create_engine('postgresql+psycopg2://{}:{}@{}:5432/{}'.format(POSTGRES_USERNAME,POSTGRES_PASSWORD,POSTGRES_URL,POSTGRES_DATABASE)) - - with engine.connect() as conn: + with POSTGRES_ENGINE.connect() as conn: conn.execute(insert_query, params) conn.commit() From 349f9e7a4685fa336ba4a46bbbe8ab5fee7f04b8 Mon Sep 17 00:00:00 2001 From: nspmx Date: Thu, 14 Mar 2024 10:37:29 -0300 Subject: [PATCH 10/29] update import logic on config.py --- api/src/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/api/src/config.py b/api/src/config.py index 8d873af..c10a1bb 100644 --- a/api/src/config.py +++ b/api/src/config.py @@ -1,4 +1,5 @@ import openai + from os import getenv from dotenv import load_dotenv from sqlalchemy import create_engine From d371926742725f358ec646c4c4c5d5f51a46b8ac Mon Sep 17 00:00:00 2001 From: nspmx Date: Thu, 14 Mar 2024 11:05:40 -0300 Subject: [PATCH 11/29] add OLLAMA_API env var --- api/src/config.py | 13 +++++++---- .../utils/api_data_request/api_generator.py | 22 +++---------------- 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/api/src/config.py b/api/src/config.py index c10a1bb..2b4e2e9 100644 --- a/api/src/config.py +++ b/api/src/config.py @@ -22,14 +22,19 @@ # OpenAI Connection OPENAI_KEY = getenv("OPENAI_KEY") +# os.environ["TOKENIZERS_PARALLELISM"] = "false" -openai.api_key = OPENAI_KEY - -if not OPENAI_KEY: +if OPENAI_KEY: + openai.api_key = OPENAI_KEY +else: print('OPENAI_KEY not found, please check your environment') exit(1) +# OLLAMA Connection +OLLAMA_API = getenv("OLLAMA_API") + # Tesseract Connection TESSERACT_API = getenv("TESSERACT_API") -print('here in config: {}'.format(POSTGRES_ENGINE.connect())) \ No newline at 
end of file +# Mondrian Connection +MONDRIAN_API = getenv('MONDRIAN_API') \ No newline at end of file diff --git a/api/src/utils/api_data_request/api_generator.py b/api/src/utils/api_data_request/api_generator.py index 0c5aeeb..2141c5a 100644 --- a/api/src/utils/api_data_request/api_generator.py +++ b/api/src/utils/api_data_request/api_generator.py @@ -1,30 +1,14 @@ +import json import openai -import os -import pandas as pd import requests import time -import json -from os import getenv -from dotenv import load_dotenv +from src.config import OLLAMA_API from src.utils.table_selection.table_details import * from src.utils.preprocessors.text import * from src.utils.api_data_request.similarity_search import * from src.utils.api_data_request.api import * -load_dotenv() - -# environment initialization -os.environ["TOKENIZERS_PARALLELISM"] = "false" - -# variable initialization -OPENAI_KEY = getenv("OPENAI_KEY") -openai.api_key = OPENAI_KEY - -TESSERACT_API = getenv("TESSERACT_API") -MONDRIAN_API = getenv('MONDRIAN_API') - - def get_api_components_messages(table, model_author, natural_language_query = ""): response_part = """ @@ -157,7 +141,7 @@ def get_api_params_from_lm(natural_language_query, table = None, model="gpt-4", cuts = json.loads(params).get("filters") elif model_author == "llama": - url = "https://caleuche-ollama.datawheel.us/api/generate" + url = "{}generate".format(OLLAMA_API) print(content) payload = { "model": model, From 7aa986a2918a6e56373fffd56e01775c2a66f904 Mon Sep 17 00:00:00 2001 From: nspmx Date: Thu, 14 Mar 2024 11:09:16 -0300 Subject: [PATCH 12/29] clean unused os dependency --- api/src/utils/table_selection/table_details.py | 1 - 1 file changed, 1 deletion(-) diff --git a/api/src/utils/table_selection/table_details.py b/api/src/utils/table_selection/table_details.py index 4e7d1f8..e6ae7f4 100644 --- a/api/src/utils/table_selection/table_details.py +++ b/api/src/utils/table_selection/table_details.py @@ -1,6 +1,5 @@ import json -from os import 
getenv from typing import List class Table: From e77c5db5ea0721afa80fb99d0577c1740bbf8047 Mon Sep 17 00:00:00 2001 From: nspmx Date: Thu, 14 Mar 2024 11:45:51 -0300 Subject: [PATCH 13/29] change mondrian and tesseract reference --- api/src/utils/api_data_request/api.py | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/api/src/utils/api_data_request/api.py b/api/src/utils/api_data_request/api.py index d44d8a2..26a71c0 100644 --- a/api/src/utils/api_data_request/api.py +++ b/api/src/utils/api_data_request/api.py @@ -1,27 +1,11 @@ -import openai -import os -import pandas as pd import requests -import json +import pandas as pd -from os import getenv -from dotenv import load_dotenv +from src.config import MONDRIAN_API, TESSERACT_API from src.utils.table_selection.table_details import * from src.utils.preprocessors.text import * from src.utils.api_data_request.similarity_search import * -load_dotenv() - -# environment initialization -os.environ["TOKENIZERS_PARALLELISM"] = "false" - -# drilldown initialization -OPENAI_KEY = getenv("OPENAI_KEY") -openai.api_key = OPENAI_KEY - -TESSERACT_API = getenv("TESSERACT_API") -MONDRIAN_API = getenv('MONDRIAN_API') - class ApiBuilder: def __init__(self, base_url): From a048ccc063fda3aafc834a1beb73d5c2e965b5a0 Mon Sep 17 00:00:00 2001 From: nspmx Date: Thu, 14 Mar 2024 11:46:17 -0300 Subject: [PATCH 14/29] change openai reference --- api/src/utils/data_analysis/data_analysis.py | 17 ++--------------- api/src/utils/table_selection/table_selector.py | 8 +------- 2 files changed, 3 insertions(+), 22 deletions(-) diff --git a/api/src/utils/data_analysis/data_analysis.py b/api/src/utils/data_analysis/data_analysis.py index 655dfab..8cd8ef0 100644 --- a/api/src/utils/data_analysis/data_analysis.py +++ b/api/src/utils/data_analysis/data_analysis.py @@ -1,19 +1,6 @@ -import os - -from os import getenv -from dotenv import load_dotenv +from src.config import OPENAI_KEY from langchain.agents import 
create_pandas_dataframe_agent from langchain.chat_models import ChatOpenAI -from langchain import OpenAI - -load_dotenv() - -# environment initialization -os.environ["TOKENIZERS_PARALLELISM"] = "false" - -# variable initialization -OPENAI_API_KEY = getenv("OPENAI_KEY") -openai_api_key = OPENAI_API_KEY def agent_answer(df, natural_language_query): @@ -33,7 +20,7 @@ def agent_answer(df, natural_language_query): """ ) - llm = ChatOpenAI(model_name='gpt-4-1106-preview', temperature=0, openai_api_key=openai_api_key) + llm = ChatOpenAI(model_name='gpt-4-1106-preview', temperature=0, openai_api_key=OPENAI_KEY) agent = create_pandas_dataframe_agent(llm, df, verbose=True) response = agent.run(prompt) diff --git a/api/src/utils/table_selection/table_selector.py b/api/src/utils/table_selection/table_selector.py index 7d3e3fd..be94914 100644 --- a/api/src/utils/table_selection/table_selector.py +++ b/api/src/utils/table_selection/table_selector.py @@ -1,21 +1,15 @@ import json import openai import time + from typing import List from sentence_transformers import SentenceTransformer -from os import getenv -from dotenv import load_dotenv from src.utils.table_selection.table_details import * from src.utils.table_selection.table_database_search import get_similar_tables from src.utils.few_shot_examples import get_few_shot_example_messages from src.utils.preprocessors.text import extract_text_from_markdown_triple_backticks -load_dotenv() - -OPENAI_KEY = getenv("OPENAI_KEY") -openai.api_key = OPENAI_KEY - def _get_table_selection_message_with_descriptions(table_manager, table_names: List[str] = None): message = ( f""" From b95b094ecba758cd82a0f7dfd5a22ff0d4f2c8e7 Mon Sep 17 00:00:00 2001 From: nspmx Date: Thu, 14 Mar 2024 11:46:32 -0300 Subject: [PATCH 15/29] change psql reference --- api/src/utils/api_data_request/similarity_search.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/api/src/utils/api_data_request/similarity_search.py 
b/api/src/utils/api_data_request/similarity_search.py index 67f8639..3623d45 100644 --- a/api/src/utils/api_data_request/similarity_search.py +++ b/api/src/utils/api_data_request/similarity_search.py @@ -1,8 +1,9 @@ import pandas as pd -from src.config import POSTGRES_ENGINE from sentence_transformers import SentenceTransformer +from src.config import POSTGRES_ENGINE + def get_similar_content(text, cube_name, drilldown_names, threshold=0, content_limit=1, embedding_model='multi-qa-MiniLM-L6-cos-v1', verbose=False): """ Receives a string, computes its embedding, and then looks for similar content in a database based on the given cube and drilldown levels. From 087e7ce767e41d8f32fe839ce5fb0f6d8374c8aa Mon Sep 17 00:00:00 2001 From: nspmx Date: Thu, 14 Mar 2024 11:49:07 -0300 Subject: [PATCH 16/29] add getenv on app.py --- api/src/utils/app.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/api/src/utils/app.py b/api/src/utils/app.py index c9ed3f5..6244c6f 100644 --- a/api/src/utils/app.py +++ b/api/src/utils/app.py @@ -1,5 +1,7 @@ import time +from os import getenv + from src.utils.table_selection.table_selector import * from src.utils.table_selection.table_details import * from src.utils.api_data_request.api_generator import * From 344bbad7d966dc3ecc079e0b42fc410183c5d999 Mon Sep 17 00:00:00 2001 From: nspmx Date: Thu, 14 Mar 2024 14:52:19 -0300 Subject: [PATCH 17/29] add TABLES_PATH to config.py --- api/src/config.py | 5 ++++- api/src/main.py | 29 ++++++++++++++--------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/api/src/config.py b/api/src/config.py index 2b4e2e9..7df3838 100644 --- a/api/src/config.py +++ b/api/src/config.py @@ -37,4 +37,7 @@ TESSERACT_API = getenv("TESSERACT_API") # Mondrian Connection -MONDRIAN_API = getenv('MONDRIAN_API') \ No newline at end of file +MONDRIAN_API = getenv('MONDRIAN_API') + +# Files Directories +TABLES_PATH = getenv('TABLES_PATH') \ No newline at end of file diff --git a/api/src/main.py 
b/api/src/main.py index a72f152..44d1ad9 100644 --- a/api/src/main.py +++ b/api/src/main.py @@ -1,29 +1,28 @@ -from dotenv import load_dotenv from fastapi import FastAPI -from os import getenv -from src.utils.app import get_api + +from config import TABLES_PATH +from utils.app import get_api # fastapi instance declaration app = FastAPI() -# get tables path -load_dotenv() -TABLES_PATH = getenv('TABLES_PATH') - # api functions @app.get("/") async def root(): - return {"message": "Hello World"} + return { + "name": "datausa-chat-api", + "status": "ok" + } @app.get("/query/{query}") async def read_item(query: str): api_url, data, text_response = get_api(query, TABLES_PATH) return { - "query": - { - "question": query, - "answer": text_response, - "url": api_url - } - } \ No newline at end of file + "query": + { + "question": query, + "answer": text_response, + "url": api_url + } + } \ No newline at end of file From 4e652e72dcb078a963364760d6e71e67a09547f1 Mon Sep 17 00:00:00 2001 From: nspmx Date: Thu, 14 Mar 2024 14:52:55 -0300 Subject: [PATCH 18/29] change env var PYTHONPATH to remove src. 
from the module call --- api/src/utils/api_data_request/api.py | 8 ++++---- api/src/utils/api_data_request/api_generator.py | 10 +++++----- api/src/utils/api_data_request/similarity_search.py | 2 +- api/src/utils/app.py | 10 +++++----- api/src/utils/data_analysis/data_analysis.py | 2 +- api/src/utils/helpers/cubes_to_db.py | 2 +- api/src/utils/helpers/drilldowns_to_db.py | 2 +- api/src/utils/logs.py | 2 +- api/src/utils/messages.py | 2 +- api/src/utils/table_selection/table_database_search.py | 2 +- api/src/utils/table_selection/table_selector.py | 8 ++++---- 11 files changed, 25 insertions(+), 25 deletions(-) diff --git a/api/src/utils/api_data_request/api.py b/api/src/utils/api_data_request/api.py index 26a71c0..91f7d5b 100644 --- a/api/src/utils/api_data_request/api.py +++ b/api/src/utils/api_data_request/api.py @@ -1,10 +1,10 @@ import requests import pandas as pd -from src.config import MONDRIAN_API, TESSERACT_API -from src.utils.table_selection.table_details import * -from src.utils.preprocessors.text import * -from src.utils.api_data_request.similarity_search import * +from config import MONDRIAN_API, TESSERACT_API +from utils.table_selection.table_details import * +from utils.preprocessors.text import * +from utils.api_data_request.similarity_search import * class ApiBuilder: diff --git a/api/src/utils/api_data_request/api_generator.py b/api/src/utils/api_data_request/api_generator.py index 2141c5a..d037aa1 100644 --- a/api/src/utils/api_data_request/api_generator.py +++ b/api/src/utils/api_data_request/api_generator.py @@ -3,11 +3,11 @@ import requests import time -from src.config import OLLAMA_API -from src.utils.table_selection.table_details import * -from src.utils.preprocessors.text import * -from src.utils.api_data_request.similarity_search import * -from src.utils.api_data_request.api import * +from config import OLLAMA_API +from utils.table_selection.table_details import * +from utils.preprocessors.text import * +from 
utils.api_data_request.similarity_search import * +from utils.api_data_request.api import * def get_api_components_messages(table, model_author, natural_language_query = ""): diff --git a/api/src/utils/api_data_request/similarity_search.py b/api/src/utils/api_data_request/similarity_search.py index 3623d45..14e9976 100644 --- a/api/src/utils/api_data_request/similarity_search.py +++ b/api/src/utils/api_data_request/similarity_search.py @@ -2,7 +2,7 @@ from sentence_transformers import SentenceTransformer -from src.config import POSTGRES_ENGINE +from config import POSTGRES_ENGINE def get_similar_content(text, cube_name, drilldown_names, threshold=0, content_limit=1, embedding_model='multi-qa-MiniLM-L6-cos-v1', verbose=False): """ diff --git a/api/src/utils/app.py b/api/src/utils/app.py index 6244c6f..0775ec7 100644 --- a/api/src/utils/app.py +++ b/api/src/utils/app.py @@ -2,11 +2,11 @@ from os import getenv -from src.utils.table_selection.table_selector import * -from src.utils.table_selection.table_details import * -from src.utils.api_data_request.api_generator import * -from src.utils.data_analysis.data_analysis import * -from src.utils.logs import * +from utils.table_selection.table_selector import * +from utils.table_selection.table_details import * +from utils.api_data_request.api_generator import * +from utils.data_analysis.data_analysis import * +from utils.logs import * def get_api(query, TABLES_PATH): start_time = time.time() diff --git a/api/src/utils/data_analysis/data_analysis.py b/api/src/utils/data_analysis/data_analysis.py index 8cd8ef0..5b93369 100644 --- a/api/src/utils/data_analysis/data_analysis.py +++ b/api/src/utils/data_analysis/data_analysis.py @@ -1,4 +1,4 @@ -from src.config import OPENAI_KEY +from config import OPENAI_KEY from langchain.agents import create_pandas_dataframe_agent from langchain.chat_models import ChatOpenAI diff --git a/api/src/utils/helpers/cubes_to_db.py b/api/src/utils/helpers/cubes_to_db.py index c66efb7..0f67019 100644 
--- a/api/src/utils/helpers/cubes_to_db.py +++ b/api/src/utils/helpers/cubes_to_db.py @@ -1,6 +1,6 @@ import pandas as pd -from src.config import POSTGRES_ENGINE +from config import POSTGRES_ENGINE from sentence_transformers import SentenceTransformer def embedding(dataframe, column): diff --git a/api/src/utils/helpers/drilldowns_to_db.py b/api/src/utils/helpers/drilldowns_to_db.py index 825b8d9..01a2a6c 100644 --- a/api/src/utils/helpers/drilldowns_to_db.py +++ b/api/src/utils/helpers/drilldowns_to_db.py @@ -2,7 +2,7 @@ import requests import urllib.parse -from src.config import POSTGRES_ENGINE +from config import POSTGRES_ENGINE from sentence_transformers import SentenceTransformer def embedding(dataframe, column): diff --git a/api/src/utils/logs.py b/api/src/utils/logs.py index 9a6cfa2..63a69d7 100644 --- a/api/src/utils/logs.py +++ b/api/src/utils/logs.py @@ -5,7 +5,7 @@ from datetime import datetime from sqlalchemy import text -from src.config import POSTGRES_ENGINE +from config import POSTGRES_ENGINE def generate_custom_id(): timestamp = str(int(time.time())) diff --git a/api/src/utils/messages.py b/api/src/utils/messages.py index 9480389..f692068 100644 --- a/api/src/utils/messages.py +++ b/api/src/utils/messages.py @@ -4,7 +4,7 @@ import openai from typing import List, Dict -from src.utils.logs import log_apicall +from utils.logs import log_apicall def get_assistant_message_from_openai( messages: List[Dict[str, str]], diff --git a/api/src/utils/table_selection/table_database_search.py b/api/src/utils/table_selection/table_database_search.py index 9034c66..816b088 100644 --- a/api/src/utils/table_selection/table_database_search.py +++ b/api/src/utils/table_selection/table_database_search.py @@ -1,6 +1,6 @@ import pandas as pd -from src.config import POSTGRES_ENGINE +from config import POSTGRES_ENGINE from typing import List def get_similar_tables(vector, threshold=0, content_limit=1) -> List[str]: diff --git a/api/src/utils/table_selection/table_selector.py 
b/api/src/utils/table_selection/table_selector.py index be94914..08d5242 100644 --- a/api/src/utils/table_selection/table_selector.py +++ b/api/src/utils/table_selection/table_selector.py @@ -5,10 +5,10 @@ from typing import List from sentence_transformers import SentenceTransformer -from src.utils.table_selection.table_details import * -from src.utils.table_selection.table_database_search import get_similar_tables -from src.utils.few_shot_examples import get_few_shot_example_messages -from src.utils.preprocessors.text import extract_text_from_markdown_triple_backticks +from utils.table_selection.table_details import * +from utils.table_selection.table_database_search import get_similar_tables +from utils.few_shot_examples import get_few_shot_example_messages +from utils.preprocessors.text import extract_text_from_markdown_triple_backticks def _get_table_selection_message_with_descriptions(table_manager, table_names: List[str] = None): message = ( From 35dbbeee4527335e7fa6fc8036023f9a7367700c Mon Sep 17 00:00:00 2001 From: nspmx Date: Thu, 14 Mar 2024 14:54:44 -0300 Subject: [PATCH 19/29] change ' to " for consistency on config --- api/src/config.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/api/src/config.py b/api/src/config.py index 7df3838..be2ad84 100644 --- a/api/src/config.py +++ b/api/src/config.py @@ -15,9 +15,9 @@ POSTGRES_PORT = 5432 if POSTGRES_HOST: - POSTGRES_ENGINE = create_engine('postgresql+psycopg2://{}:{}@{}:{}/{}'.format(POSTGRES_USER,POSTGRES_PASSWORD,POSTGRES_HOST,POSTGRES_PORT,POSTGRES_DB)) + POSTGRES_ENGINE = create_engine("postgresql+psycopg2://{}:{}@{}:{}/{}".format(POSTGRES_USER,POSTGRES_PASSWORD,POSTGRES_HOST,POSTGRES_PORT,POSTGRES_DB)) else: - print('POSTGRES_HOST not found, please check your environment') + print("POSTGRES_HOST not found, please check your environment") exit(1) # OpenAI Connection @@ -27,7 +27,7 @@ if OPENAI_KEY: openai.api_key = OPENAI_KEY else: - print('OPENAI_KEY not found, please 
check your environment') + print("OPENAI_KEY not found, please check your environment") exit(1) # OLLAMA Connection @@ -37,7 +37,7 @@ TESSERACT_API = getenv("TESSERACT_API") # Mondrian Connection -MONDRIAN_API = getenv('MONDRIAN_API') +MONDRIAN_API = getenv("MONDRIAN_API") # Files Directories -TABLES_PATH = getenv('TABLES_PATH') \ No newline at end of file +TABLES_PATH = getenv("TABLES_PATH") \ No newline at end of file From cee833f602a224bce32f095f3d089485e25db438 Mon Sep 17 00:00:00 2001 From: nspmx Date: Thu, 14 Mar 2024 14:57:46 -0300 Subject: [PATCH 20/29] add FEW_SHOT_PATH to config.py --- api/src/config.py | 1 + api/src/utils/few_shot_examples.py | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/api/src/config.py b/api/src/config.py index be2ad84..55f598e 100644 --- a/api/src/config.py +++ b/api/src/config.py @@ -40,4 +40,5 @@ MONDRIAN_API = getenv("MONDRIAN_API") # Files Directories +FEW_SHOT_PATH = getenv("FEW_SHOT_PATH") TABLES_PATH = getenv("TABLES_PATH") \ No newline at end of file diff --git a/api/src/utils/few_shot_examples.py b/api/src/utils/few_shot_examples.py index 75fee9e..4ed34f0 100644 --- a/api/src/utils/few_shot_examples.py +++ b/api/src/utils/few_shot_examples.py @@ -1,15 +1,13 @@ import json from typing import List -from os import getenv -FEW_SHOT_PATH = getenv('FEW_SHOT_PATH') +from config import FEW_SHOT_PATH few_shot_examples = {} with open(FEW_SHOT_PATH, "r") as f: few_shot_examples = json.load(f) - def get_few_shot_example_messages(mode: str = "table_selection", n=-1) -> List[dict]: examples = few_shot_examples.get("USA", {}).get(mode, []) if n > 0: From f44196d77195f6c41c732b903337f28d3139d934 Mon Sep 17 00:00:00 2001 From: nspmx Date: Thu, 14 Mar 2024 14:59:44 -0300 Subject: [PATCH 21/29] remove unused variable at config.py --- api/src/config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/api/src/config.py b/api/src/config.py index 55f598e..2ed32a0 100644 --- a/api/src/config.py +++ b/api/src/config.py @@ 
-22,7 +22,6 @@ # OpenAI Connection OPENAI_KEY = getenv("OPENAI_KEY") -# os.environ["TOKENIZERS_PARALLELISM"] = "false" if OPENAI_KEY: openai.api_key = OPENAI_KEY From cb20279d1c069046f403c4972139b925987615e3 Mon Sep 17 00:00:00 2001 From: nspmx Date: Fri, 15 Mar 2024 14:32:42 -0300 Subject: [PATCH 22/29] update dockerfile to new structure --- api/Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/api/Dockerfile b/api/Dockerfile index b625da9..e96a18e 100644 --- a/api/Dockerfile +++ b/api/Dockerfile @@ -34,9 +34,10 @@ RUN apt-get install -y --no-install-recommends \ # copy app files COPY --from=build /usr/app/venv ./venv -COPY . . +COPY /src . +COPY /data ./data ENV PATH="/usr/app/venv/bin:$PATH" # Run the app -CMD [ "uvicorn", "src.main:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "80", "--timeout-keep-alive", "120" ] \ No newline at end of file +CMD [ "uvicorn", "main:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "80", "--timeout-keep-alive", "120" ] \ No newline at end of file From a01ddc76863985b09a9457bf14c2bbeaf3017e0a Mon Sep 17 00:00:00 2001 From: nspmx Date: Fri, 15 Mar 2024 15:28:35 -0300 Subject: [PATCH 23/29] change POSTGRES_HOST on workflow --- .github/workflows/google-registry-api.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/google-registry-api.yaml b/.github/workflows/google-registry-api.yaml index 2e2dc54..546df53 100644 --- a/.github/workflows/google-registry-api.yaml +++ b/.github/workflows/google-registry-api.yaml @@ -154,7 +154,7 @@ jobs: --set configMap.FEW_SHOT_PATH=${{ vars.FEW_SHOT_PATH }} \ --set configMap.MONDRIAN_API=${{ vars.MONDRIAN_API }} \ --set configMap.POSTGRES_DB=${{ vars.POSTGRES_DB }} \ - --set configMap.POSTGRES_URL=${{ vars.POSTGRES_URL }} \ + --set configMap.POSTGRES_HOST=${{ vars.POSTGRES_HOST }} \ --set configMap.POSTGRES_USER=${{ vars.POSTGRES_USER }} \ --set configMap.TABLES_PATH=${{ vars.TABLES_PATH }} \ --set 
configMap.TESSERACT_API=${{ vars.TESSERACT_API }} \ From b341fc72b5307a081c67128779c38b7a3ee10042 Mon Sep 17 00:00:00 2001 From: Alexandra Date: Wed, 20 Mar 2024 16:47:42 -0300 Subject: [PATCH 24/29] Add scripts to load cubes and drilldowns to db --- .../helpers/{cubes_to_db.py => cube_to_db.py} | 0 api/src/utils/helpers/drilldowns_to_db.py | 14 +- api/src/utils/helpers/load_cubes_to_db.py | 48 + .../utils/helpers/load_drilldowns_to_db.py | 92 ++ api/src/utils/helpers/output.json | 1366 +++++++++++++++++ 5 files changed, 1516 insertions(+), 4 deletions(-) rename api/src/utils/helpers/{cubes_to_db.py => cube_to_db.py} (100%) create mode 100644 api/src/utils/helpers/load_cubes_to_db.py create mode 100644 api/src/utils/helpers/load_drilldowns_to_db.py create mode 100644 api/src/utils/helpers/output.json diff --git a/api/src/utils/helpers/cubes_to_db.py b/api/src/utils/helpers/cube_to_db.py similarity index 100% rename from api/src/utils/helpers/cubes_to_db.py rename to api/src/utils/helpers/cube_to_db.py diff --git a/api/src/utils/helpers/drilldowns_to_db.py b/api/src/utils/helpers/drilldowns_to_db.py index 01a2a6c..6f6cf6d 100644 --- a/api/src/utils/helpers/drilldowns_to_db.py +++ b/api/src/utils/helpers/drilldowns_to_db.py @@ -48,16 +48,22 @@ def load_data_to_db(api_url, measure_name): cube_name, drilldown = get_api_params(api_url) df = get_data_from_api(api_url=api_url) - df.rename(columns={f"{drilldown}": "product_name", f"{drilldown} ID": "product_id"}, inplace=True) + df.rename(columns={f"{drilldown}": "drilldown_name", f"{drilldown} ID": "drilldown_id"}, inplace=True) df['cube_name'] = f"{cube_name}" df['drilldown'] = f"{drilldown}" df.drop(f"{measure_name}", axis=1, inplace=True) + if 'drilldown_id' not in df.columns: + df['drilldown_id'] = df['drilldown'] + + df.replace('', pd.NA, inplace=True) + df.dropna(subset=['drilldown_name', 'drilldown_id'], how='all', inplace=True) + print(df.head()) - df_embeddings = embedding(df, 'product_name') - 
df_embeddings.to_sql('drilldowns', con=POSTGRES_ENGINE, if_exists='append', index=False, schema='datausa_drilldowns') + #df_embeddings = embedding(df, 'product_name') + #df_embeddings.to_sql('drilldowns', con=POSTGRES_ENGINE, if_exists='append', index=False, schema='datausa_drilldowns') return @@ -69,5 +75,5 @@ def load_data_to_db(api_url, measure_name): #df = pd.read_csv('/Users/alexandrabjanes/Datawheel/CODE/datausa-chat/tables.csv') #print(df.head()) -create_table() +#create_table() load_data_to_db(api_url, measure_name = measure_name) diff --git a/api/src/utils/helpers/load_cubes_to_db.py b/api/src/utils/helpers/load_cubes_to_db.py new file mode 100644 index 0000000..48dc50d --- /dev/null +++ b/api/src/utils/helpers/load_cubes_to_db.py @@ -0,0 +1,48 @@ +import json +import pandas as pd +from config import POSTGRES_ENGINE +from sentence_transformers import SentenceTransformer + +# ENV Variables + +table_name = 'cubes' +schema_name = 'datausa_tables' +embedding_size = 384 + +def embedding(dataframe, column): + """ + Creates embeddings for text in the column passed as argument + """ + model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1') + + model_embeddings = model.encode(dataframe[column].to_list()) + dataframe['embedding'] = model_embeddings.tolist() + + return dataframe + +def create_table(table_name, schema_name, embedding_size = 384): + POSTGRES_ENGINE.execute(f"CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} (table_name text, table_description text, embedding vector({embedding_size}))") + return + +def load_data_to_db(df, table_name, schema_name): + df_embeddings = embedding(df, 'table_description') + df_embeddings.to_sql(table_name, con=POSTGRES_ENGINE, if_exists='append', index=False, schema=schema_name) + return + +with open('output.json', 'r') as file: + cubes_data = json.load(file) + +cubes = [] + +for cube in cubes_data["tables"]: + cube_info = { + "table_name": cube["name"], + "table_description": cube["description"] + } + 
cubes.append(cube_info) + +df = pd.DataFrame(cubes) + +create_table() + +load_data_to_db(df) \ No newline at end of file diff --git a/api/src/utils/helpers/load_drilldowns_to_db.py b/api/src/utils/helpers/load_drilldowns_to_db.py new file mode 100644 index 0000000..77056bd --- /dev/null +++ b/api/src/utils/helpers/load_drilldowns_to_db.py @@ -0,0 +1,92 @@ +import requests +import pandas as pd +import urllib.parse +from sentence_transformers import SentenceTransformer +import json +from config import POSTGRES_ENGINE + +# ENV Variables + +table_name = 'drilldowns' +schema_name = 'datausa_drilldowns' +embedding_size = 384 + + +def embedding(dataframe, column): + """ + Creates embeddings for text in the column passed as argument + """ + model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1') + + model_embeddings = model.encode(dataframe[column].to_list()) + dataframe['embedding'] = model_embeddings.tolist() + + return dataframe + + +def create_table(table_name, schema_name, embedding_size = 384): + POSTGRES_ENGINE.execute(f"CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} (drilldown_id text, drilldown_name text, cube_name text, drilldown text, embedding vector({embedding_size}))") + return + + +def get_data_from_api(api_url): + try: + r = requests.get(api_url) + df = pd.DataFrame.from_dict(r.json()['data']) + except: + raise ValueError('Invalid API url:', api_url) + + return df + + +def get_api_params(api_url): + parsed_url = urllib.parse.urlparse(api_url) + query_params = urllib.parse.parse_qs(parsed_url.query) + + cube = query_params.get('cube', [''])[0] + drilldown = query_params.get('drilldowns', [''])[0] + + cube_name = cube.replace('+', ' ') + drilldown = drilldown.replace('+', ' ') + + return cube_name, drilldown + + +def load_data_to_db(api_url, measure_name, table_name, schema_name): + cube_name, drilldown = get_api_params(api_url) + df = get_data_from_api(api_url=api_url) + + df.rename(columns={f"{drilldown}": "drilldown_name", f"{drilldown} ID": 
"drilldown_id"}, inplace=True) + + df['cube_name'] = f"{cube_name}" + df['drilldown'] = f"{drilldown}" + df.drop(f"{measure_name}", axis=1, inplace=True) + + if 'drilldown_id' not in df.columns: + df['drilldown_id'] = df['drilldown'] + + df.replace('', pd.NA, inplace=True) + df.dropna(subset=['drilldown_name', 'drilldown_id'], how='all', inplace=True) + + print(df.head()) + + df_embeddings = embedding(df, 'drilldown_name') + df_embeddings.to_sql(table_name, con=POSTGRES_ENGINE, if_exists='append', index=False, schema=schema_name) + + return + + +with open('output.json', 'r') as file: + cubes_json = json.load(file) + +create_table(table_name, schema_name) + +for table in cubes_json['tables']: + cube_name = table['name'] + measure = table['measures'][0]['name'] + for dimension in table['dimensions']: + for hierarchy in dimension['hierarchies']: + for level in hierarchy['levels']: + api_url = f"https://api-dev.datausa.io/tesseract/data.jsonrecords?cube={cube_name}&drilldowns={level}&measures={measure}" + load_data_to_db(api_url, measure, table_name, schema_name) + diff --git a/api/src/utils/helpers/output.json b/api/src/utils/helpers/output.json new file mode 100644 index 0000000..d364278 --- /dev/null +++ b/api/src/utils/helpers/output.json @@ -0,0 +1,1366 @@ +{ + "tables": [ + { + "name": "Data_USA_House_Compact_election", + "api": "Tesseract", + "description": "Table `Data_USA_House_Compact_election` has data on Winner Votes, Other Votes, Total Votes.", + "measures": [ + { + "name": "Winner Votes", + "description": "Contains the winner votes for Data USA House Compact election" + }, + { + "name": "Other Votes", + "description": "Contains the other votes for Data USA House Compact election" + }, + { + "name": "Total Votes", + "description": "Contains the total votes for Data USA House Compact election" + } + ], + "dimensions": [ + { + "name": "Geography", + "description": "geography dimension of the data.", + "hierarchies": [ + { + "name": "Geography", + "levels": [ 
+ "State", + "Congressional District" + ] + } + ] + }, + { + "name": "Year", + "description": "year dimension of the data.", + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] + }, + { + "name": "Candidate", + "description": "candidate dimension of the data.", + "hierarchies": [ + { + "name": "Candidate", + "levels": [ + "Candidate" + ] + } + ] + }, + { + "name": "Special", + "description": "special dimension of the data.", + "hierarchies": [ + { + "name": "Special", + "levels": [ + "Special" + ] + } + ] + }, + { + "name": "Party", + "description": "party dimension of the data.", + "hierarchies": [ + { + "name": "Party", + "levels": [ + "Party" + ] + } + ] + }, + { + "name": "Runoff", + "description": "runoff dimension of the data.", + "hierarchies": [ + { + "name": "Runoff", + "levels": [ + "Runoff" + ] + } + ] + } + ] + }, + { + "name": "Data_USA_House_election", + "api": "Tesseract", + "description": "Table `Data_USA_House_election` has data on Candidate Votes, Total Votes.", + "measures": [ + { + "name": "Candidate Votes", + "description": "Contains the candidate votes for Data USA House election" + }, + { + "name": "Total Votes", + "description": "Contains the total votes for Data USA House election" + } + ], + "dimensions": [ + { + "name": "Geography", + "description": "geography dimension of the data.", + "hierarchies": [ + { + "name": "Geography", + "levels": [ + "State", + "Congressional District" + ] + } + ] + }, + { + "name": "Candidate", + "description": "candidate dimension of the data.", + "hierarchies": [ + { + "name": "Candidate", + "levels": [ + "Candidate" + ] + } + ] + }, + { + "name": "Candidate Other", + "description": "candidate other dimension of the data.", + "hierarchies": [ + { + "name": "Candidate Other", + "levels": [ + "Candidate Other" + ] + } + ] + }, + { + "name": "Party", + "description": "party dimension of the data.", + "hierarchies": [ + { + "name": "Party", + "levels": [ + "Party" + ] + } + ] + }, + { + 
"name": "Special", + "description": "special dimension of the data.", + "hierarchies": [ + { + "name": "Special", + "levels": [ + "Special" + ] + } + ] + }, + { + "name": "Runoff", + "description": "runoff dimension of the data.", + "hierarchies": [ + { + "name": "Runoff", + "levels": [ + "Runoff" + ] + } + ] + }, + { + "name": "Unofficial", + "description": "unofficial dimension of the data.", + "hierarchies": [ + { + "name": "Unofficial", + "levels": [ + "Unofficial" + ] + } + ] + }, + { + "name": "Year", + "description": "year dimension of the data.", + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] + } + ] + }, + { + "name": "Consumer Price Index - CPI", + "api": "Tesseract", + "description": "Table `Consumer Price Index - CPI` has data on Consumer Price Index, Standard Error, Percent Change.", + "measures": [ + { + "name": "Consumer Price Index", + "description": "Contains the consumer price index for Consumer Price Index - CPI" + }, + { + "name": "Standard Error", + "description": "Contains the standard error for Consumer Price Index - CPI" + }, + { + "name": "Percent Change", + "description": "Contains the percent change for Consumer Price Index - CPI" + } + ], + "dimensions": [ + { + "name": "Time", + "description": "time dimension of the data.", + "hierarchies": [ + { + "name": "Time", + "levels": [ + "Year", + "Month and Year" + ] + } + ] + }, + { + "name": "Product or Service", + "description": "product or service dimension of the data.", + "hierarchies": [ + { + "name": "Product Level 1", + "levels": [ + "Level 1.1" + ] + }, + { + "name": "Product Level 2", + "levels": [ + "Level 2.1", + "Level 2.2" + ] + }, + { + "name": "Product Level 3", + "levels": [ + "Level 3.1", + "Level 3.2", + "Level 3.3" + ] + }, + { + "name": "Product Level 4", + "levels": [ + "Level 4.1", + "Level 4.2", + "Level 4.3", + "Level 4.4" + ] + }, + { + "name": "Product Level 5", + "levels": [ + "Level 5.1", + "Level 5.2", + "Level 5.3", + "Level 5.4", + 
"Level 5.5" + ] + }, + { + "name": "Product Level 6", + "levels": [ + "Level 6.1", + "Level 6.2", + "Level 6.3", + "Level 6.4", + "Level 6.5", + "Level 6.6" + ] + }, + { + "name": "Product Level 7", + "levels": [ + "Level 7.1", + "Level 7.2", + "Level 7.3", + "Level 7.4", + "Level 7.5", + "Level 7.6", + "Level 7.7" + ] + } + ] + } + ] + }, + { + "name": "bls_growth_industry", + "api": "Tesseract", + "description": "Table `bls_growth_industry` has data on Industry Jobs, Industry Jobs Change, Industry Jobs CARC, Industry Output, Industry Output CARC.", + "measures": [ + { + "name": "Industry Jobs", + "description": "Contains the industry jobs for bls growth industry" + }, + { + "name": "Industry Jobs Change", + "description": "Contains the industry jobs change for bls growth industry" + }, + { + "name": "Industry Jobs CARC", + "description": "Contains the industry jobs carc for bls growth industry" + }, + { + "name": "Industry Output", + "description": "Contains the industry output for bls growth industry" + }, + { + "name": "Industry Output CARC", + "description": "Contains the industry output carc for bls growth industry" + } + ], + "dimensions": [ + { + "name": "Year", + "description": "year dimension of the data.", + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] + }, + { + "name": "BLS Industry Flat", + "description": "bls industry flat dimension of the data.", + "hierarchies": [] + } + ] + }, + { + "name": "onet_by_cip", + "api": "Tesseract", + "description": "Table `onet_by_cip` has data on IM Value, LV Value, Total Score.", + "measures": [ + { + "name": "IM Value", + "description": "Contains the im value for onet by cip" + }, + { + "name": "LV Value", + "description": "Contains the lv value for onet by cip" + }, + { + "name": "Total Score", + "description": "Contains the total score for onet by cip" + } + ], + "dimensions": [ + { + "name": "CIP", + "description": "cip dimension of the data.", + "hierarchies": [] + }, + { + "name": 
"Skill Element", + "description": "skill element dimension of the data.", + "hierarchies": [] + }, + { + "name": "Year", + "description": "year dimension of the data.", + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] + } + ] + }, + { + "name": "usa_spending", + "api": "Tesseract", + "description": "Table `usa_spending` has data on Obligation Amount, Total Loan Value.", + "measures": [ + { + "name": "Obligation Amount", + "description": "Contains the obligation amount for usa spending" + }, + { + "name": "Total Loan Value", + "description": "Contains the total loan value for usa spending" + } + ], + "dimensions": [ + { + "name": "Geography", + "description": "geography dimension of the data.", + "hierarchies": [ + { + "name": "Nation", + "levels": [ + "Nation" + ] + }, + { + "name": "County", + "levels": [ + "State", + "County" + ] + } + ] + }, + { + "name": "Action Date", + "description": "action date dimension of the data.", + "hierarchies": [ + { + "name": "Action Date", + "levels": [ + "Year", + "Quarter", + "Month", + "Day" + ] + } + ] + }, + { + "name": "Fiscal Year", + "description": "fiscal year dimension of the data.", + "hierarchies": [ + { + "name": "Fiscal Year", + "levels": [ + "Fiscal Year" + ] + } + ] + }, + { + "name": "Transaction Type", + "description": "transaction type dimension of the data.", + "hierarchies": [ + { + "name": "Transaction Type", + "levels": [ + "Transaction Type Parent", + "Transaction Type" + ] + } + ] + }, + { + "name": "Agency", + "description": "agency dimension of the data.", + "hierarchies": [ + { + "name": "Agency", + "levels": [ + "Department", + "Agency" + ] + } + ] + }, + { + "name": "Product Service Code", + "description": "product service code dimension of the data.", + "hierarchies": [ + { + "name": "Product Service Code", + "levels": [ + "PSC Group", + "PSC Sub Group" + ] + } + ] + }, + { + "name": "NAPCS", + "description": "napcs dimension of the data.", + "hierarchies": [ + { + "name": 
"NAPCS", + "levels": [ + "NAPCS Section", + "NAPCS Group", + "NAPCS Class" + ] + } + ] + } + ] + }, + { + "name": "health_opioid_overdose_deathrate", + "api": "Tesseract", + "description": "Table `health_opioid_overdose_deathrate` has data on Opioid overdose death rate.", + "measures": [ + { + "name": "Opioid overdose death rate", + "description": "Contains the opioid overdose death rate for health opioid overdose deathrate" + } + ], + "dimensions": [ + { + "name": "Year", + "description": "year dimension of the data.", + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] + }, + { + "name": "Geography", + "description": "geography dimension of the data.", + "hierarchies": [ + { + "name": "Nation", + "levels": [ + "Nation" + ] + }, + { + "name": "State", + "levels": [ + "State" + ] + }, + { + "name": "County", + "levels": [ + "State County", + "County" + ] + } + ] + } + ] + }, + { + "name": "BLS Employment - Industry Only", + "api": "Tesseract", + "description": "Table `BLS Employment - Industry Only` has data on NSA Employees, NSA Average Employees, SA Employees, SA Average Employees.", + "measures": [ + { + "name": "NSA Employees", + "description": "Contains the nsa employees for BLS Employment - Industry Only" + }, + { + "name": "NSA Average Employees", + "description": "Contains the nsa average employees for BLS Employment - Industry Only" + }, + { + "name": "SA Employees", + "description": "Contains the sa employees for BLS Employment - Industry Only" + }, + { + "name": "SA Average Employees", + "description": "Contains the sa average employees for BLS Employment - Industry Only" + } + ], + "dimensions": [ + { + "name": "Time", + "description": "time dimension of the data.", + "hierarchies": [ + { + "name": "Time", + "levels": [ + "Month of Year" + ] + } + ] + }, + { + "name": "Employment State", + "description": "employment state dimension of the data.", + "hierarchies": [ + { + "name": "Employment State", + "levels": [ + "Employment 
State" + ] + } + ] + }, + { + "name": "Industry", + "description": "industry dimension of the data.", + "hierarchies": [ + { + "name": "Industry", + "levels": [ + "Industry" + ] + } + ] + } + ] + }, + { + "name": "Data_USA_Electoral_College_president", + "api": "Tesseract", + "description": "Table `Data_USA_Electoral_College_president` has data on Electoral College Votes.", + "measures": [ + { + "name": "Electoral College Votes", + "description": "Contains the electoral college votes for Data USA Electoral College president" + } + ], + "dimensions": [ + { + "name": "State", + "description": "state dimension of the data.", + "hierarchies": [] + }, + { + "name": "Year", + "description": "year dimension of the data.", + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] + }, + { + "name": "Party", + "description": "party dimension of the data.", + "hierarchies": [ + { + "name": "Party", + "levels": [ + "Party" + ] + } + ] + } + ] + }, + { + "name": "bea_use", + "api": "Tesseract", + "description": "Table `bea_use` has data on Value Millions.", + "measures": [ + { + "name": "Value Millions", + "description": "Contains the value millions for bea use" + } + ], + "dimensions": [ + { + "name": "Year", + "description": "year dimension of the data.", + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] + }, + { + "name": "Industry IO Code", + "description": "industry io code dimension of the data.", + "hierarchies": [ + { + "name": "Industry IO Code", + "levels": [ + "Industry L0", + "Industry L1" + ] + } + ] + }, + { + "name": "Commodity IO Code", + "description": "commodity io code dimension of the data.", + "hierarchies": [ + { + "name": "Commodity IO Code", + "levels": [ + "Commodity L0", + "Commodity L1" + ] + } + ] + } + ] + }, + { + "name": "Data_USA_President_election", + "api": "Tesseract", + "description": "Table `Data_USA_President_election` has data on Candidate Votes, Total Votes.", + "measures": [ + { + "name": 
"Candidate Votes", + "description": "Contains the candidate votes for Data USA President election" + }, + { + "name": "Total Votes", + "description": "Contains the total votes for Data USA President election" + } + ], + "dimensions": [ + { + "name": "Geography", + "description": "geography dimension of the data.", + "hierarchies": [ + { + "name": "Nation", + "levels": [ + "Nation" + ] + }, + { + "name": "State", + "levels": [ + "State" + ] + }, + { + "name": "County", + "levels": [ + "State County", + "County" + ] + } + ] + }, + { + "name": "Candidate", + "description": "candidate dimension of the data.", + "hierarchies": [ + { + "name": "Candidate", + "levels": [ + "Candidate" + ] + } + ] + }, + { + "name": "Party", + "description": "party dimension of the data.", + "hierarchies": [ + { + "name": "Party", + "levels": [ + "Party" + ] + } + ] + }, + { + "name": "Year", + "description": "year dimension of the data.", + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] + } + ] + }, + { + "name": "dot_faf", + "api": "Tesseract", + "description": "Table `dot_faf` has data on Millions Of Dollars, Thousands Of Tons.", + "measures": [ + { + "name": "Millions Of Dollars", + "description": "Contains the millions of dollars for dot faf" + }, + { + "name": "Thousands Of Tons", + "description": "Contains the thousands of tons for dot faf" + } + ], + "dimensions": [ + { + "name": "Year", + "description": "year dimension of the data.", + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] + }, + { + "name": "Origin", + "description": "origin dimension of the data.", + "hierarchies": [ + { + "name": "Origin", + "levels": [ + "Origin State", + "Origin Region" + ] + } + ] + }, + { + "name": "Destination", + "description": "destination dimension of the data.", + "hierarchies": [ + { + "name": "Destination", + "levels": [ + "Destination State", + "Destination Region" + ] + } + ] + }, + { + "name": "SCTG", + "description": "sctg dimension 
of the data.", + "hierarchies": [ + { + "name": "SCTG", + "levels": [ + "SCTG2" + ] + } + ] + }, + { + "name": "Transportation Mode", + "description": "transportation mode dimension of the data.", + "hierarchies": [ + { + "name": "Transportation Mode", + "levels": [ + "Transportation Mode" + ] + } + ] + } + ] + }, + { + "name": "BLS Unemployment Insurance Claims - Most Recent", + "api": "Tesseract", + "description": "Table `BLS Unemployment Insurance Claims - Most Recent` has data on Initial Claims, Continued Claims, Covered Employment, Insured Unemployment Rate.", + "measures": [ + { + "name": "Initial Claims", + "description": "Contains the initial claims for BLS Unemployment Insurance Claims - Most Recent" + }, + { + "name": "Continued Claims", + "description": "Contains the continued claims for BLS Unemployment Insurance Claims - Most Recent" + }, + { + "name": "Covered Employment", + "description": "Contains the covered employment for BLS Unemployment Insurance Claims - Most Recent" + }, + { + "name": "Insured Unemployment Rate", + "description": "Contains the insured unemployment rate for BLS Unemployment Insurance Claims - Most Recent" + } + ], + "dimensions": [ + { + "name": "State", + "description": "state dimension of the data.", + "hierarchies": [] + }, + { + "name": "Week Ended", + "description": "week ended dimension of the data.", + "hierarchies": [ + { + "name": "Week Ended", + "levels": [ + "Week Ended" + ] + } + ] + }, + { + "name": "Week Previous", + "description": "week previous dimension of the data.", + "hierarchies": [ + { + "name": "Week Previous", + "levels": [ + "Week Previous" + ] + } + ] + } + ] + }, + { + "name": "ed_defaults", + "api": "Tesseract", + "description": "Table `ed_defaults` has data on Borrowers In Default, Borrowers Entered Repayment, Default Rate.", + "measures": [ + { + "name": "Borrowers In Default", + "description": "Contains the borrowers in default for ed defaults" + }, + { + "name": "Borrowers Entered Repayment", + 
"description": "Contains the borrowers entered repayment for ed defaults" + }, + { + "name": "Default Rate", + "description": "Contains the default rate for ed defaults" + } + ], + "dimensions": [ + { + "name": "Year", + "description": "year dimension of the data.", + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] + }, + { + "name": "Geography", + "description": "geography dimension of the data.", + "hierarchies": [ + { + "name": "Geography", + "levels": [ + "State", + "County" + ] + } + ] + }, + { + "name": "OPEID", + "description": "opeid dimension of the data.", + "hierarchies": [ + { + "name": "OPEID", + "levels": [ + "OPEID" + ] + } + ] + } + ] + }, + { + "name": "bls_ces", + "api": "Tesseract", + "description": "Table `bls_ces` has data on Industry Average Hourly Earnings, Industry Average Weekly Hours, Industry Employees Thousands.", + "measures": [ + { + "name": "Industry Average Hourly Earnings", + "description": "Contains the industry average hourly earnings for bls ces" + }, + { + "name": "Industry Average Weekly Hours", + "description": "Contains the industry average weekly hours for bls ces" + }, + { + "name": "Industry Employees Thousands", + "description": "Contains the industry employees thousands for bls ces" + } + ], + "dimensions": [ + { + "name": "Year", + "description": "year dimension of the data.", + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] + }, + { + "name": "BLS Industry Flat", + "description": "bls industry flat dimension of the data.", + "hierarchies": [] + } + ] + }, + { + "name": "health_estimates_of_chronically_homeless_individuals", + "api": "Tesseract", + "description": "Table `health_estimates_of_chronically_homeless_individuals` has data on Estimates of Chronically Homeless Individuals.", + "measures": [ + { + "name": "Estimates of Chronically Homeless Individuals", + "description": "Contains the estimates of chronically homeless individuals for health estimates of 
chronically homeless individuals" + } + ], + "dimensions": [ + { + "name": "Year", + "description": "year dimension of the data.", + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] + }, + { + "name": "Geography", + "description": "geography dimension of the data.", + "hierarchies": [ + { + "name": "State", + "levels": [ + "State" + ] + } + ] + } + ] + }, + { + "name": "Data_USA_Senate_election", + "api": "Tesseract", + "description": "Table `Data_USA_Senate_election` has data on Candidate Votes, Total Votes.", + "measures": [ + { + "name": "Candidate Votes", + "description": "Contains the candidate votes for Data USA Senate election" + }, + { + "name": "Total Votes", + "description": "Contains the total votes for Data USA Senate election" + } + ], + "dimensions": [ + { + "name": "State", + "description": "state dimension of the data.", + "hierarchies": [] + }, + { + "name": "Candidate", + "description": "candidate dimension of the data.", + "hierarchies": [ + { + "name": "Candidate", + "levels": [ + "Candidate" + ] + } + ] + }, + { + "name": "Candidate Other", + "description": "candidate other dimension of the data.", + "hierarchies": [ + { + "name": "Candidate Other", + "levels": [ + "Candidate Other" + ] + } + ] + }, + { + "name": "Party", + "description": "party dimension of the data.", + "hierarchies": [ + { + "name": "Party", + "levels": [ + "Party" + ] + } + ] + }, + { + "name": "Special", + "description": "special dimension of the data.", + "hierarchies": [ + { + "name": "Special", + "levels": [ + "Special" + ] + } + ] + }, + { + "name": "Unofficial", + "description": "unofficial dimension of the data.", + "hierarchies": [ + { + "name": "Unofficial", + "levels": [ + "Unofficial" + ] + } + ] + }, + { + "name": "Year", + "description": "year dimension of the data.", + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] + } + ] + }, + { + "name": "BLS Employment - Supersector Only", + "api": "Tesseract", + 
"description": "Table `BLS Employment - Supersector Only` has data on NSA Employees, NSA Average Employees, SA Employees, SA Average Employees.", + "measures": [ + { + "name": "NSA Employees", + "description": "Contains the nsa employees for BLS Employment - Supersector Only" + }, + { + "name": "NSA Average Employees", + "description": "Contains the nsa average employees for BLS Employment - Supersector Only" + }, + { + "name": "SA Employees", + "description": "Contains the sa employees for BLS Employment - Supersector Only" + }, + { + "name": "SA Average Employees", + "description": "Contains the sa average employees for BLS Employment - Supersector Only" + } + ], + "dimensions": [ + { + "name": "Time", + "description": "time dimension of the data.", + "hierarchies": [ + { + "name": "Time", + "levels": [ + "Month of Year" + ] + } + ] + }, + { + "name": "Employment State", + "description": "employment state dimension of the data.", + "hierarchies": [ + { + "name": "Employment State", + "levels": [ + "Employment State" + ] + } + ] + }, + { + "name": "Supersector", + "description": "supersector dimension of the data.", + "hierarchies": [ + { + "name": "Supersector", + "levels": [ + "Supersector" + ] + } + ] + } + ] + }, + { + "name": "BLS Unemployment Insurance Claims", + "api": "Tesseract", + "description": "Table `BLS Unemployment Insurance Claims` has data on Initial Claims, Continued Claims, Covered Employment, Insured Unemployment Rate.", + "measures": [ + { + "name": "Initial Claims", + "description": "Contains the initial claims for BLS Unemployment Insurance Claims" + }, + { + "name": "Continued Claims", + "description": "Contains the continued claims for BLS Unemployment Insurance Claims" + }, + { + "name": "Covered Employment", + "description": "Contains the covered employment for BLS Unemployment Insurance Claims" + }, + { + "name": "Insured Unemployment Rate", + "description": "Contains the insured unemployment rate for BLS Unemployment Insurance Claims" 
+ } + ], + "dimensions": [ + { + "name": "State", + "description": "state dimension of the data.", + "hierarchies": [] + }, + { + "name": "Week Ended", + "description": "week ended dimension of the data.", + "hierarchies": [ + { + "name": "Week Ended", + "levels": [ + "Week Ended" + ] + } + ] + }, + { + "name": "Week Previous", + "description": "week previous dimension of the data.", + "hierarchies": [ + { + "name": "Week Previous", + "levels": [ + "Week Previous" + ] + } + ] + } + ] + }, + { + "name": "onet_by_pums", + "api": "Tesseract", + "description": "Table `onet_by_pums` has data on IM Value, LV Value, Total Score.", + "measures": [ + { + "name": "IM Value", + "description": "Contains the im value for onet by pums" + }, + { + "name": "LV Value", + "description": "Contains the lv value for onet by pums" + }, + { + "name": "Total Score", + "description": "Contains the total score for onet by pums" + } + ], + "dimensions": [ + { + "name": "PUMS Occupation", + "description": "pums occupation dimension of the data.", + "hierarchies": [] + }, + { + "name": "Skill Element", + "description": "skill element dimension of the data.", + "hierarchies": [] + }, + { + "name": "Year", + "description": "year dimension of the data.", + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] + } + ] + }, + { + "name": "bls_growth_occupation", + "api": "Tesseract", + "description": "Table `bls_growth_occupation` has data on Occupation Employment, Occupation Employment Percent, Occupation Employment Change, Occupation Employment Change Percent, Occupation Employment Openings.", + "measures": [ + { + "name": "Occupation Employment", + "description": "Contains the occupation employment for bls growth occupation" + }, + { + "name": "Occupation Employment Percent", + "description": "Contains the occupation employment percent for bls growth occupation" + }, + { + "name": "Occupation Employment Change", + "description": "Contains the occupation employment change for 
bls growth occupation" + }, + { + "name": "Occupation Employment Change Percent", + "description": "Contains the occupation employment change percent for bls growth occupation" + }, + { + "name": "Occupation Employment Openings", + "description": "Contains the occupation employment openings for bls growth occupation" + } + ], + "dimensions": [ + { + "name": "Year", + "description": "year dimension of the data.", + "hierarchies": [ + { + "name": "Year", + "levels": [ + "Year" + ] + } + ] + }, + { + "name": "BLS Occupation Flat", + "description": "bls occupation flat dimension of the data.", + "hierarchies": [] + } + ] + } + ] +} \ No newline at end of file From 3ba6f784aa3e1cf93a75252cc4e72436945ebc70 Mon Sep 17 00:00:00 2001 From: Alexandra Date: Fri, 22 Mar 2024 11:19:19 -0300 Subject: [PATCH 25/29] Add SQL similarity functions --- .../similarity_functions/match_drilldowns.sql | 19 +++++++++++++++++++ .../similarity_functions/match_table.sql | 16 ++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 api/src/utils/similarity_functions/match_drilldowns.sql create mode 100644 api/src/utils/similarity_functions/match_table.sql diff --git a/api/src/utils/similarity_functions/match_drilldowns.sql b/api/src/utils/similarity_functions/match_drilldowns.sql new file mode 100644 index 0000000..f59a2f1 --- /dev/null +++ b/api/src/utils/similarity_functions/match_drilldowns.sql @@ -0,0 +1,19 @@ +CREATE OR REPLACE FUNCTION public.match_drilldowns(query_embedding vector, similarity_threshold double precision, match_count integer, table_name text, drilldown_names text[]) + RETURNS TABLE(drilldown_id text, drilldown_name text, similarity double precision) + LANGUAGE plpgsql +AS $function$ +BEGIN + RETURN QUERY + select + drilldowns.drilldown_id, + drilldowns.drilldown, + 1 - (drilldowns.embedding <=> query_embedding) AS similarity + FROM datasaudi_drilldowns.drilldowns + WHERE drilldowns.cube_name = table_name + AND drilldowns.drilldown = ANY(drilldown_names) + AND 
1 - (drilldowns.embedding <=> query_embedding) > similarity_threshold + ORDER BY drilldowns.embedding <=> query_embedding + LIMIT match_count; +END; +$function$ +; diff --git a/api/src/utils/similarity_functions/match_table.sql b/api/src/utils/similarity_functions/match_table.sql new file mode 100644 index 0000000..7493e44 --- /dev/null +++ b/api/src/utils/similarity_functions/match_table.sql @@ -0,0 +1,16 @@ +CREATE OR REPLACE FUNCTION public.match_table(query_embedding vector, similarity_threshold double precision, match_count integer) + RETURNS TABLE(table_name text, similarity double precision) + LANGUAGE plpgsql +AS $function$ +begin + return query + select + cubes.table_name, + 1 - (cubes.embedding <=> query_embedding) as similarity + from datausa_tables.cubes + where 1 - (cubes.embedding <=> query_embedding) > similarity_threshold + order by cubes.embedding <=> query_embedding + limit match_count; +end; +$function$ +; From 4e3d76b89b994c1e8c0be04bb0dab00dbdbd9574 Mon Sep 17 00:00:00 2001 From: Alexandra Date: Fri, 22 Mar 2024 11:19:38 -0300 Subject: [PATCH 26/29] small fixes --- api/src/utils/helpers/load_cubes_to_db.py | 4 ++-- api/src/utils/helpers/load_drilldowns_to_db.py | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/api/src/utils/helpers/load_cubes_to_db.py b/api/src/utils/helpers/load_cubes_to_db.py index 48dc50d..5ead0e0 100644 --- a/api/src/utils/helpers/load_cubes_to_db.py +++ b/api/src/utils/helpers/load_cubes_to_db.py @@ -43,6 +43,6 @@ def load_data_to_db(df, table_name, schema_name): df = pd.DataFrame(cubes) -create_table() +create_table(table_name, schema_name) -load_data_to_db(df) \ No newline at end of file +load_data_to_db(df, table_name, schema_name) \ No newline at end of file diff --git a/api/src/utils/helpers/load_drilldowns_to_db.py b/api/src/utils/helpers/load_drilldowns_to_db.py index 77056bd..4f68ebd 100644 --- a/api/src/utils/helpers/load_drilldowns_to_db.py +++ 
b/api/src/utils/helpers/load_drilldowns_to_db.py @@ -3,7 +3,7 @@ import urllib.parse from sentence_transformers import SentenceTransformer import json -from config import POSTGRES_ENGINE +from config import POSTGRES_ENGINE, TESSERACT_API # ENV Variables @@ -63,11 +63,13 @@ def load_data_to_db(api_url, measure_name, table_name, schema_name): df.drop(f"{measure_name}", axis=1, inplace=True) if 'drilldown_id' not in df.columns: - df['drilldown_id'] = df['drilldown'] + df['drilldown_id'] = df['drilldown_name'] df.replace('', pd.NA, inplace=True) df.dropna(subset=['drilldown_name', 'drilldown_id'], how='all', inplace=True) + df = df[['drilldown_id', 'drilldown_name', 'cube_name', 'drilldown']] + df['drilldown_name'] = df['drilldown_name'].astype(str) print(df.head()) df_embeddings = embedding(df, 'drilldown_name') @@ -87,6 +89,5 @@ def load_data_to_db(api_url, measure_name, table_name, schema_name): for dimension in table['dimensions']: for hierarchy in dimension['hierarchies']: for level in hierarchy['levels']: - api_url = f"https://api-dev.datausa.io/tesseract/data.jsonrecords?cube={cube_name}&drilldowns={level}&measures={measure}" - load_data_to_db(api_url, measure, table_name, schema_name) - + api_url = f"{TESSERACT_API}data.jsonrecords?cube={cube_name}&drilldowns={level}&measures={measure}" + load_data_to_db(api_url, measure, table_name, schema_name) \ No newline at end of file From 5e621021ba144c3cceebe63564dd56473ae30f40 Mon Sep 17 00:00:00 2001 From: Alexandra Date: Fri, 22 Mar 2024 11:20:01 -0300 Subject: [PATCH 27/29] add script to map xml schema to custom json --- api/src/utils/helpers/schema_to_json.py | 95 +++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 api/src/utils/helpers/schema_to_json.py diff --git a/api/src/utils/helpers/schema_to_json.py b/api/src/utils/helpers/schema_to_json.py new file mode 100644 index 0000000..2aacb26 --- /dev/null +++ b/api/src/utils/helpers/schema_to_json.py @@ -0,0 +1,95 @@ +import 
xml.etree.ElementTree as ET +import json +import sys + +def parse_xml_to_json(xml_file): + tree = ET.parse(xml_file) + root = tree.getroot() + + tables = [] + + # Parse Shared Dimensions + shared_dimensions = {} + for shared_dimension in root.findall('.//SharedDimension'): + shared_dimension_name = shared_dimension.get('name') + hierarchy = shared_dimension.find('.//Hierarchy') + hierarchy_name = hierarchy.get('name') + levels = [level.get('name') for level in hierarchy.findall('.//Level')] + shared_dimensions[shared_dimension_name] = {"hierarchy": hierarchy_name, "levels": levels} + + # Parse Cubes + for cube in root.findall('.//Cube'): + table_name = cube.get('name') + + measures = [] + dimensions = [] + table_description = None + + # Parse Measures + for measure in cube.findall('.//Measure'): + measure_name = measure.get('name') + measure_description = measure.find('.//Annotation[@name="caption_en"]') + measure_description = measure_description.text if measure_description is not None else "" + measures.append({"name": measure_name, "description": measure_description}) + + # Parse Cube-specific Dimensions + for dimension in cube.findall('.//Dimension'): + dimension_name = dimension.get('name') + hierarchy_name = dimension.find('.//Hierarchy').get('name') + levels = [level.get('name') for level in dimension.findall('.//Level')] + dimensions.append({"name": dimension_name, "hierarchy": hierarchy_name, "levels": levels}) + + # Parse Dimension Usage (Shared Dimensions) + for dimension_usage in cube.findall('.//DimensionUsage'): + dimension_name = dimension_usage.get('name') + shared_dimension_name = dimension_usage.get('source') + hierarchy_info = shared_dimensions.get(shared_dimension_name) + hierarchy_name = hierarchy_info["hierarchy"] + levels = hierarchy_info["levels"] + dimensions.append({"name": dimension_name, "hierarchy": hierarchy_name, "levels": levels}) + + # Parse Table Description + table_annotation = cube.find('.//Annotation[@name="table_en"]') + if 
table_annotation is not None: + table_description = table_annotation.text + + tables.append({ + "name": table_name, + "api": "Tesseract", + "description": table_description, + "measures": measures, + "dimensions": [ + { + "name": dimension["name"], + "description": f"{dimension['name']} dimension of the data.", + "hierarchies": [ + { + "name": dimension["hierarchy"], + "levels": dimension["levels"] + } + ] + } for dimension in dimensions + ] + }) + + return {"tables": tables} + + +def main(input_file, output_file): + xml_file = input_file + json_output = parse_xml_to_json(xml_file) + print(json.dumps(json_output, indent=4)) + + with open(output_file, 'w') as f: + json.dump(json_output, f, indent=4) + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: python schema_to_json.py ") + sys.exit(1) + + input_file = sys.argv[1] + output_file = sys.argv[2] + + main(input_file, output_file) \ No newline at end of file From ab1962370c31c8ca20fbe5b528d442df5ca6ac66 Mon Sep 17 00:00:00 2001 From: Alexandra Date: Wed, 27 Mar 2024 15:34:50 -0300 Subject: [PATCH 28/29] change migration scripts --- README.md | 2 +- api/src/utils/classification/__init__.py | 0 api/src/utils/classification/input_classification.py | 0 api/src/utils/helpers/{ => old}/cube_to_db.py | 0 api/src/utils/helpers/{ => old}/drilldowns_to_db.py | 3 --- api/src/utils/helpers/{ => old}/tesseract_schema.json | 0 api/src/utils/helpers/{ => old}/tesseract_schema_mapping.py | 0 api/src/utils/helpers/schema_to_json.py | 3 +++ 8 files changed, 4 insertions(+), 4 deletions(-) delete mode 100644 api/src/utils/classification/__init__.py delete mode 100644 api/src/utils/classification/input_classification.py rename api/src/utils/helpers/{ => old}/cube_to_db.py (100%) rename api/src/utils/helpers/{ => old}/drilldowns_to_db.py (94%) rename api/src/utils/helpers/{ => old}/tesseract_schema.json (100%) rename api/src/utils/helpers/{ => old}/tesseract_schema_mapping.py (100%) diff --git a/README.md 
b/README.md index 99bb3c7..d49d12c 100644 --- a/README.md +++ b/README.md @@ -132,7 +132,7 @@ In order to add one cube, the steps are: - The script then appends a column containing embeddings generated from the drilldown names using the same embedding model mentioned before. - This process needs to be repeated for each drilldown level within the cube or those required for making cuts. Time variables don't need to be loaded into the database. -### [For future projects] In progress... +### [Migration for future projects] In progress... To add all the cubes of a project automatically, they can be mapped from the tesseract schema json to the custom format needed in the app. To do this follow these steps: diff --git a/api/src/utils/classification/__init__.py b/api/src/utils/classification/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/api/src/utils/classification/input_classification.py b/api/src/utils/classification/input_classification.py deleted file mode 100644 index e69de29..0000000 diff --git a/api/src/utils/helpers/cube_to_db.py b/api/src/utils/helpers/old/cube_to_db.py similarity index 100% rename from api/src/utils/helpers/cube_to_db.py rename to api/src/utils/helpers/old/cube_to_db.py diff --git a/api/src/utils/helpers/drilldowns_to_db.py b/api/src/utils/helpers/old/drilldowns_to_db.py similarity index 94% rename from api/src/utils/helpers/drilldowns_to_db.py rename to api/src/utils/helpers/old/drilldowns_to_db.py index 6f6cf6d..13a8712 100644 --- a/api/src/utils/helpers/drilldowns_to_db.py +++ b/api/src/utils/helpers/old/drilldowns_to_db.py @@ -72,8 +72,5 @@ def load_data_to_db(api_url, measure_name): api_url = input() print("Enter measure name: ") measure_name = input() -#df = pd.read_csv('/Users/alexandrabjanes/Datawheel/CODE/datausa-chat/tables.csv') -#print(df.head()) -#create_table() load_data_to_db(api_url, measure_name = measure_name) diff --git a/api/src/utils/helpers/tesseract_schema.json 
b/api/src/utils/helpers/old/tesseract_schema.json similarity index 100% rename from api/src/utils/helpers/tesseract_schema.json rename to api/src/utils/helpers/old/tesseract_schema.json diff --git a/api/src/utils/helpers/tesseract_schema_mapping.py b/api/src/utils/helpers/old/tesseract_schema_mapping.py similarity index 100% rename from api/src/utils/helpers/tesseract_schema_mapping.py rename to api/src/utils/helpers/old/tesseract_schema_mapping.py diff --git a/api/src/utils/helpers/schema_to_json.py b/api/src/utils/helpers/schema_to_json.py index 2aacb26..b8540df 100644 --- a/api/src/utils/helpers/schema_to_json.py +++ b/api/src/utils/helpers/schema_to_json.py @@ -3,6 +3,9 @@ import sys def parse_xml_to_json(xml_file): + """ + Parses XML schema to custom json format. + """ tree = ET.parse(xml_file) root = tree.getroot() From 9e8012d9b57d1b42f2c953a56c863100d93347d0 Mon Sep 17 00:00:00 2001 From: Alexandra Date: Thu, 28 Mar 2024 11:46:20 -0300 Subject: [PATCH 29/29] Revert "change migration scripts" This reverts commit ab1962370c31c8ca20fbe5b528d442df5ca6ac66. 
--- README.md | 2 +- api/src/utils/classification/__init__.py | 0 api/src/utils/classification/input_classification.py | 0 api/src/utils/helpers/{old => }/cube_to_db.py | 0 api/src/utils/helpers/{old => }/drilldowns_to_db.py | 3 +++ api/src/utils/helpers/schema_to_json.py | 3 --- api/src/utils/helpers/{old => }/tesseract_schema.json | 0 api/src/utils/helpers/{old => }/tesseract_schema_mapping.py | 0 8 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 api/src/utils/classification/__init__.py create mode 100644 api/src/utils/classification/input_classification.py rename api/src/utils/helpers/{old => }/cube_to_db.py (100%) rename api/src/utils/helpers/{old => }/drilldowns_to_db.py (94%) rename api/src/utils/helpers/{old => }/tesseract_schema.json (100%) rename api/src/utils/helpers/{old => }/tesseract_schema_mapping.py (100%) diff --git a/README.md b/README.md index d49d12c..99bb3c7 100644 --- a/README.md +++ b/README.md @@ -132,7 +132,7 @@ In order to add one cube, the steps are: - The script then appends a column containing embeddings generated from the drilldown names using the same embedding model mentioned before. - This process needs to be repeated for each drilldown level within the cube or those required for making cuts. Time variables don't need to be loaded into the database. -### [Migration for future projects] In progress... +### [For future projects] In progress... To add all the cubes of a project automatically, they can be mapped from the tesseract schema json to the custom format needed in the app. 
To do this follow these steps: diff --git a/api/src/utils/classification/__init__.py b/api/src/utils/classification/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/api/src/utils/classification/input_classification.py b/api/src/utils/classification/input_classification.py new file mode 100644 index 0000000..e69de29 diff --git a/api/src/utils/helpers/old/cube_to_db.py b/api/src/utils/helpers/cube_to_db.py similarity index 100% rename from api/src/utils/helpers/old/cube_to_db.py rename to api/src/utils/helpers/cube_to_db.py diff --git a/api/src/utils/helpers/old/drilldowns_to_db.py b/api/src/utils/helpers/drilldowns_to_db.py similarity index 94% rename from api/src/utils/helpers/old/drilldowns_to_db.py rename to api/src/utils/helpers/drilldowns_to_db.py index 13a8712..6f6cf6d 100644 --- a/api/src/utils/helpers/old/drilldowns_to_db.py +++ b/api/src/utils/helpers/drilldowns_to_db.py @@ -72,5 +72,8 @@ def load_data_to_db(api_url, measure_name): api_url = input() print("Enter measure name: ") measure_name = input() +#df = pd.read_csv('/Users/alexandrabjanes/Datawheel/CODE/datausa-chat/tables.csv') +#print(df.head()) +#create_table() load_data_to_db(api_url, measure_name = measure_name) diff --git a/api/src/utils/helpers/schema_to_json.py b/api/src/utils/helpers/schema_to_json.py index b8540df..2aacb26 100644 --- a/api/src/utils/helpers/schema_to_json.py +++ b/api/src/utils/helpers/schema_to_json.py @@ -3,9 +3,6 @@ import sys def parse_xml_to_json(xml_file): - """ - Parses XML schema to custom json format. 
- """ tree = ET.parse(xml_file) root = tree.getroot() diff --git a/api/src/utils/helpers/old/tesseract_schema.json b/api/src/utils/helpers/tesseract_schema.json similarity index 100% rename from api/src/utils/helpers/old/tesseract_schema.json rename to api/src/utils/helpers/tesseract_schema.json diff --git a/api/src/utils/helpers/old/tesseract_schema_mapping.py b/api/src/utils/helpers/tesseract_schema_mapping.py similarity index 100% rename from api/src/utils/helpers/old/tesseract_schema_mapping.py rename to api/src/utils/helpers/tesseract_schema_mapping.py