Normalised (#4)

Normalised
register-dynamics · Oct 9, 2019 · 250ff23 · 250ff23
2 parents 945ca99 + d234c8a
commit 250ff23
Show file tree

Hide file tree

Showing 4 changed files with 214 additions and 79 deletions.
diff --git a/Pipfile b/Pipfile
@@ -7,9 +7,10 @@ verify_ssl = true
 
 [packages]
 click = "*"
-petl = "*"
 openpyxl = "*"
 tqdm = "*"
+pylint = "*"
+xlrd = "*"
 
 [requires]
 python_version = "3.7"
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/README.md b/README.md
@@ -9,13 +9,22 @@ With Python 3 and [Pipenv][pipenv] installed:
 
 1. Clone this repository and run `pipenv install` within it.
 2. Download the [Code-Point Open][cpo] dataset and unzip it.
-3. Run the script using `python bin/cli.py /path/to/unzipped_directory/ > output.csv`
+3. Activate a local environment using `pipenv shell`.
+4. Run the script using `python bin/cli.py /path/to/unzipped_data/ /path/to/output_dir/`
 
-The script adds columns for lookup fields and a WKT-formatted version of the coordinates.
-The only column it modifies is `Postcode`, formatting it with a single space between the
-outward and inward parts. To remove some of the columns after processing, use a tool like
-csvcut from [csvkit][csvkit].
+The script will write a `.csv` file for every set of regions included in the code-point
+open "code list" files. It will also merge all the data files, converting linked values
+to [CURIE][curie] format. Finally, it modifies the `Postcode` column to format it with a
+single space
+
+This script does the following:
+
+- For each set of regions in the "code list" files, outputs a `.csv` file to create a register from
+- Merges all the data files into `code-point-open.csv`
+- Converts all the linked values in the data files to [CURIE][curie] format
+- Formats the `postcode` column with a single space between outward and inward parts
+- Adds a `geometry` column that is a space-separated combination of `eastings` and `northings`
 
 [cpo]: https://www.ordnancesurvey.co.uk/business-government/products/code-point-open
 [pipenv]: https://pipenv.readthedocs.io/en/latest/
-[csvkit]: https://pipenv.readthedocs.io/en/latest/
+[curie]: https://spec.openregister.org/v2/datatypes/curie
diff --git a/bin/cli.py b/bin/cli.py
@@ -1,16 +1,21 @@
 from os import listdir
 from os.path import join, isfile
-from sys import stdout
+import re
+from collections import OrderedDict
 import csv
 
 import click
-from openpyxl import load_workbook
+import openpyxl
+import xlrd
 from tqdm import tqdm # progress bar
 
-HEADERS_PATH = 'Doc/Code-Point_Open_Column_Headers.csv'
-CODES_PATH = 'Doc/Codelist.xlsx'
-DATA_PATH = 'Data/CSV'
-COUNTRIES = {
+DATA_HEADERS_PATH = 'Doc/Code-Point_Open_Column_Headers.csv'
+REGIONS_PATH = 'Doc/Codelist.xlsx'
+NHS_REGIONS_FULL_PATH = 'Doc/NHS_Codelist.xls'
+DATA_DIR_PATH = 'Data/CSV'
+MERGED_DATA_FILE_NAME = 'code-point-open.csv'
+COUNTRY_CODES_REGISTER_NAME = 'uk-country'
+COUNTRY_CODES = {
   'E92000001': 'England',
   'S92000003': 'Scotland',
   'W92000004': 'Wales',
@@ -20,66 +25,99 @@
 @click.command()
 @click.argument('package_dir', type=click.Path(exists=True,
                 dir_okay=True, file_okay=False))
-def main(package_dir):
-  """Denormalise and improve usability of Code-Point Open data"""
-
-  headers = load_headers(join(package_dir, HEADERS_PATH))
-  codes = load_codes(join(package_dir, CODES_PATH))
-
-  new_headers = headers + ('Country', 'County', 'District', 'Ward', 'Geometry')
-  writer = csv.DictWriter(stdout, fieldnames=new_headers)
-  writer.writeheader()
-
-  data_dir = join(package_dir, DATA_PATH)
-  data_files = [f for f in listdir(data_dir) if isfile(join(data_dir, f))]
-
-  for data_file in tqdm(data_files):
-    with open(join(data_dir, data_file), newline='') as data_file:
-      reader = csv.reader(data_file)
-      for row in reader:
-        data = dict(zip(headers, row))
-        new_data = {
-          **data,
-          'Postcode': format_postcode(data['Postcode']),
-          'Country': codes['countries'].get(data['Country_code']),
-          'County': codes['counties'].get(data['Admin_county_code']),
-          'District': codes['districts'].get(data['Admin_district_code']),
-          'Ward': codes['wards'].get(data['Admin_ward_code']),
-          'Geometry': format_geometry(data['Eastings'], data['Northings']),
-        }
-        writer.writerow(new_data)
-
-def load_headers(headers_path):
-  with open(headers_path, newline='') as headers_file:
-    data = list(csv.reader(headers_file))
-    return tuple(data[1])
-
-def load_codes(codes_path):
-  codes = {'counties': {}}
-  workbook = load_workbook(codes_path)
-
-  codes = {
-    'countries': COUNTRIES,
-    'counties': get_dict_from_sheet(workbook['CTY']),
-    'districts': {
-      **get_dict_from_sheet(workbook['DIS']),
-      **get_dict_from_sheet(workbook['LBO']),
-    },
-    'wards': {
-      **get_dict_from_sheet(workbook['DIW']),
-      **get_dict_from_sheet(workbook['LBW']),
-    },
-  }
-
-  return codes
-
-def get_dict_from_sheet(sheet):
-  data = dict()
-
-  for (value, key) in sheet.iter_rows(min_row=1, max_col=2, values_only=True):
-    data[key] = value
-
-  return data
+@click.argument('output_dir', type=click.Path(exists=True,
+                dir_okay=True, file_okay=False))
+def main(package_dir, output_dir):
+  """Improve usability of Code-Point Open data and link to lookup files"""
+
+  regions_full_path = join(package_dir, REGIONS_PATH)
+
+  # Parse table of contents from regions workbook
+  regions_workbook = openpyxl.load_workbook(regions_full_path)
+  regions_workbook_toc = {row[0]: slugify(row[1])
+                          for row in regions_workbook['AREA_CODES'].values}
+
+  # For each region sheet, write a register-ready CSV file
+  # and build a map of region codes to sheet slugs for curies.
+  region_to_sheet = {}
+  for sheet_code, sheet_slug in regions_workbook_toc.items():
+    header = [sheet_slug, 'name']
+    sheet_file_path = join(output_dir, sheet_slug + '.csv')
+    tqdm.write('Writing region file: ' + sheet_file_path)
+
+    with open(sheet_file_path, 'w', newline='') as sheet_file:
+      region_writer = csv.writer(sheet_file)
+      region_writer.writerow(header)
+
+      for region_name, region_code in regions_workbook[sheet_code].values:
+        region_writer.writerow([region_code, region_name])
+        region_to_sheet[region_code] = sheet_slug
+
+  # Do the same for NHS region workbook, using older library
+  nhs_regions_full_path = join(package_dir, NHS_REGIONS_FULL_PATH)
+  nhs_regions_workbook = xlrd.open_workbook(nhs_regions_full_path)
+  for sheet in nhs_regions_workbook.sheets():
+    sheet_slug = slugify(sheet.name)
+    header = [sheet_slug, 'name']
+    sheet_file_path = join(output_dir, sheet_slug + '.csv')
+    tqdm.write('Writing NHS region file: ' + sheet_file_path)
+
+    with open(sheet_file_path, 'w', newline='') as sheet_file:
+      region_writer = csv.writer(sheet_file)
+      region_writer.writerow(header)
+
+      for region_code, region_name in sheet.get_rows():
+        region_writer.writerow([region_code.value, region_name.value])
+        region_to_sheet[region_code.value] = sheet_slug
+
+  # Write country codes to a register-ready CSV and append to code map
+  country_codes_full_path = join(output_dir, COUNTRY_CODES_REGISTER_NAME + '.csv')
+  with open(country_codes_full_path, 'w', newline='') as country_codes_file:
+    country_codes_writer = csv.writer(country_codes_file)
+    country_codes_writer.writerow([COUNTRY_CODES_REGISTER_NAME, 'name'])
+
+    for country_code, country_name in COUNTRY_CODES.items():
+      country_codes_writer.writerow([country_code, country_name])
+      region_to_sheet[country_code] = COUNTRY_CODES_REGISTER_NAME
+
+  # Load headers to prepend to data from headers file
+  headers = tuple()
+  data_headers_full_path = join(package_dir, DATA_HEADERS_PATH)
+  with open(data_headers_full_path, newline='') as data_headers_file:
+    raw_headers = list(csv.reader(data_headers_file))[1]
+    headers = [slugify(header) for header in raw_headers]
+    headers.append('geometry')
+
+  # For each data file
+  data_dir_full_path = join(package_dir, DATA_DIR_PATH)
+  data_file_paths = [join(data_dir_full_path, filename)
+                     for filename in listdir(data_dir_full_path)
+                     if isfile(join(data_dir_full_path, filename))]
+
+  merged_data_full_path = join(output_dir, MERGED_DATA_FILE_NAME)
+  with open(merged_data_full_path, 'w', newline='') as merged_data_file:
+    merged_data_writer = csv.DictWriter(merged_data_file, fieldnames=headers)
+    merged_data_writer.writeheader()
+
+    for data_file_path in tqdm(data_file_paths):
+      tqdm.write('Processing data file: ' + data_file_path)
+      with open(data_file_path, newline='') as data_file:
+        data_reader = csv.DictReader(data_file, fieldnames=headers)
+        for row in data_reader:
+          new_row = {}
+          for key, value in row.items():
+            if key.endswith('_code') and value in region_to_sheet:
+              new_row[key] = region_to_sheet[value] + ':' + value
+            else:
+              new_row[key] = value
+
+          new_row['postcode'] = format_postcode(new_row['postcode'])
+          new_row['geometry'] = format_geometry(new_row['eastings'], new_row['northings'])
+
+          merged_data_writer.writerow(new_row)
+
+def slugify(input):
+  return input.replace(' ', '-').lower()
 
 def format_postcode(postcode):
   outward = postcode[:-3].strip()