# Placeholder values that mean "no phone number provided".
_PLACEHOLDER_PHONES = frozenset({"n/a", "same as above"})

# A US number written with a parenthesized area code, e.g. "(123) 456-7890",
# "(123) 456 7890", "(123)456.7890" or "(123) 456–7890" (en dash).
# The separator between the exchange and the line number is optional and may
# be a hyphen, an en dash, a dot and/or whitespace.
_PAREN_PHONE_RE = re.compile(r"\((\d{3})\)\s*(\d{3})\s*[-–.]?\s*(\d{4})")


def sanitize_phone(phone: str) -> str:
    """Return a cleaned-up version of a scraped phone string.

    * Placeholder values ("N/A", "same as above"; case-insensitive and
      ignoring surrounding whitespace) become the empty string.
    * If the text contains a number with a parenthesized area code, the
      FIRST such number is returned as ``XXX-XXX-XXXX``. Anything else in
      the string (trailing text, a second / toll-free number, ...) is
      dropped.
    * Any other input is returned unchanged.
    """
    if phone.strip().lower() in _PLACEHOLDER_PHONES:
        return ""

    match = _PAREN_PHONE_RE.search(phone)
    if match:
        # groups() is (area code, exchange, line number)
        return "-".join(match.groups())
    return phone


def _fix_bad_dashes(phone: str) -> str:
    """Replace en dashes with ASCII hyphens, then sanitize the phone string.

    Dashes are normalized *before* sanitizing so that numbers written with
    an en dash are recognized and reformatted as well.
    """
    return sanitize_phone(phone.replace("–", "-"))