-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[incomplete] Begin adding "meeting scraper" logic to sync script
- Loading branch information
1 parent
cc6fac1
commit 5450533
Showing
6 changed files
with
293 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,3 +7,6 @@ site | |
*.py[cod] | ||
__pycache__/ | ||
dist/ | ||
|
||
# sync | ||
_workspace/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
''' | ||
Collect meeting data from remote sources | ||
''' | ||
import importlib | ||
import logging | ||
|
||
|
||
class CollectionException(Exception):
    '''
    Fatal (non-recoverable) feed import error

    Raised by collectors so that callers can log the failure and
    continue with the remaining feeds.
    '''
|
||
|
||
def refresh(source_url, source_type):
    '''
    Refresh meeting information from a source_url

    :param source_url: URL of the remote feed to collect
    :param source_type: name of the collector submodule to use
    :raises Exception: if the collector submodule cannot be loaded
    :returns: whatever the collector's refresh() returns
    '''
    # import_module() raises ImportError on failure instead of returning
    # a falsy value, so the old `if not collector` check could never
    # fire; catch the ImportError and re-raise with a readable,
    # fully-interpolated message.
    try:
        collector = importlib.import_module('.' + source_type, __name__)
    except ImportError as e:
        raise Exception(f'Unable to load "{source_type}" collector.') from e
    return collector.refresh(source_url)
|
||
|
||
def fetch_all(source_data):
    '''
    Attempt to refresh all feeds found in source_data

    :param source_data: mapping of subdomain -> site data dict; a dict
        may carry a 'feed' entry that is either a single "type^url"
        string or a list of such strings
    '''
    # Assemble list of remote sources; the subdomain keys themselves
    # are not needed here, only each site's feed value(s)
    sources = []
    for data in source_data.values():
        if 'feed' not in data:
            continue
        feed = data['feed']
        if isinstance(feed, list):
            sources.extend(feed)
        else:
            sources.append(feed)

    # Process feeds; a failed feed is logged and skipped so one bad
    # source does not abort the whole run
    for source in sources:
        src_type, src_url = source.split('^')
        try:
            refresh(src_url, src_type)
        except CollectionException as e:
            logging.error('Importing feed "%s" failed: %s', src_url, e)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
''' | ||
Collect meeting data from TSML | ||
''' | ||
import hashlib | ||
import logging | ||
import os | ||
import re | ||
import requests | ||
|
||
import sync.db | ||
from sync.collect import CollectionException | ||
|
||
# Pattern to identify TSML-UI source data | ||
TSML_RE = ( | ||
r'<div id="tsml-ui"\s+' | ||
r'data-src="([^"]+\.json)[0-9?]*"\s+' | ||
r'data-timezone="([^"]+)"') | ||
|
||
|
||
def refresh(source_url):
    '''
    Pull meeting data from Twelve-Step-Meeting-List (TSML) websites

    :param source_url: URL of a page embedding a TSML-UI widget
    :raises CollectionException: on a non-200 response, or when no
        TSML source data can be found in the page
    '''
    # Fetch current meeting page; bound the request so a hung server
    # cannot stall the whole sync run
    response = requests.get(source_url, timeout=30)
    if response.status_code != 200:
        # f-string here: exception constructors do not %-interpolate
        # logging-style arguments, so the old form printed a raw tuple
        raise CollectionException(
            f'Unexpected response [{response.status_code}] from {source_url}')

    # Identify current source data from page
    match = re.search(TSML_RE, response.text)
    if not match:
        raise CollectionException(f'No TSML source data found in {source_url}')
    data_src = match.group(1)
    data_timezone = match.group(2)
    # NOTE(review): assumes data_src is a path starting with '/'; a
    # fully-qualified URL or bare relative path in data-src would
    # produce a broken join — confirm against real TSML pages
    source_json = f'{os.path.dirname(source_url)}{data_src}'

    # Calculate checksum-based cache file names from the full url
    url_checksum = hashlib.sha256(source_json.encode('utf-8')).hexdigest()
    cache_json = f'tsml_{url_checksum}.json'
    cache_tz = f'tsml_{url_checksum}.tz'

    # Check if this json file has already been cached
    if sync.db.exists(cache_json):
        logging.info('%s exists in cache; using local copy', source_json)
    else:
        # Download TSML data and store it with its timezone
        response = requests.get(source_json, timeout=30)
        if response.status_code != 200:
            raise CollectionException(
                f'Unexpected response [{response.status_code}] from {source_json}')
        sync.db.set(cache_tz, data_timezone)
        sync.db.set(cache_json, response.content)
        logging.info('Collected meeting data from %s', source_json)
|
||
# |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
''' | ||
Internal Database | ||
''' | ||
import logging | ||
import os | ||
import pathlib | ||
import sqlite3 | ||
|
||
# Container for loaded database | ||
open_database = None | ||
|
||
|
||
def open(dbpath):
    '''
    Open a sqlite3 database, initializing it if not present

    NOTE: intentionally shadows the builtin open() within this module.

    :param dbpath: filesystem path to the sqlite3 database file
    '''
    global open_database

    # Close any open handler before switching databases
    if open_database:
        open_database.close()

    # Ensure parent directory exists
    parent = pathlib.Path(os.path.dirname(dbpath))
    parent.mkdir(parents=True, exist_ok=True)

    # Open sqlite3 database
    logging.info('Connecting to database at %s', dbpath)
    open_database = sqlite3.connect(dbpath)

    # Initialize schema. A non-INTEGER PRIMARY KEY already gets an
    # automatic unique index from sqlite3, so only the lon/lat lookup
    # columns need explicit indexes.
    cursor = open_database.cursor()
    cursor.executescript('''
        BEGIN;
        CREATE TABLE IF NOT EXISTS store (
            key TEXT PRIMARY KEY,
            value BLOB NOT NULL);
        CREATE TABLE IF NOT EXISTS geostore (
            key TEXT PRIMARY KEY,
            lon REAL NOT NULL,
            lat REAL NOT NULL,
            value BLOB NOT NULL);
        CREATE INDEX IF NOT EXISTS idx_geostore_lon ON geostore (lon);
        CREATE INDEX IF NOT EXISTS idx_geostore_lat ON geostore (lat);
        COMMIT;
    ''')
|
||
|
||
def exists(key, table='store'):
    '''
    Return True if a key exists in data store

    :param key: key to look up
    :param table: table to search ('store' by default)
    :raises Exception: if the key is somehow stored more than once
    '''
    cursor = open_database.cursor()
    cursor.execute(
        f'SELECT COUNT(key) FROM {table} WHERE key=?;',
        (key,))
    (count,) = cursor.fetchone()
    if count > 1:
        raise Exception('Duplicate keys detected in data store')
    return count == 1
|
||
|
||
def geo_exists(key):
    '''
    Return True if a key exists in geo store

    Thin wrapper around exists() that targets the geostore table.
    '''
    return exists(key, table='geostore')
|
||
|
||
def get(key, default=None, table='store'):
    '''
    Return stored value for a given key

    :param key: key to look up
    :param default: value returned when the key is absent
    :param table: table to search ('store' by default)
    :raises Exception: if the key is somehow stored more than once
    '''
    cursor = open_database.cursor()
    cursor.execute(
        f'SELECT value FROM {table} WHERE key=?;',
        (key,))
    rows = cursor.fetchall()
    if len(rows) > 1:
        raise Exception('Duplicate keys detected in data store')
    return rows[0][0] if rows else default
|
||
|
||
def geo_get(key, default=None):
    '''
    Return stored geo data for a given key

    Thin wrapper around get() that targets the geostore table.
    '''
    return get(key, default=default, table='geostore')
|
||
|
||
def geo_search(minlon=None, maxlon=None, minlat=None, maxlat=None):
    '''
    Return results for a given geo search (inclusive bounding box)

    :param minlon: optional minimum longitude
    :param maxlon: optional maximum longitude
    :param minlat: optional minimum latitude
    :param maxlat: optional maximum latitude
    :returns: list of (key, lon, lat, value) tuples
    '''
    cursor = open_database.cursor()

    # Base SQL query; WHERE 1=1 lets every condition append with AND
    sql_query = 'SELECT key, lon, lat, value FROM geostore WHERE 1=1'

    # Build the condition string and its parameter list in a single
    # pass so the two can never drift out of sync
    params = []
    for condition, bound in (
            (' AND lon >= ?', minlon),
            (' AND lon <= ?', maxlon),
            (' AND lat >= ?', minlat),
            (' AND lat <= ?', maxlat)):
        if bound is not None:
            sql_query += condition
            params.append(bound)

    # Fetch search results
    cursor.execute(sql_query, tuple(params))
    return cursor.fetchall()
|
||
|
||
def set(key, value):
    '''
    Add or replace a key/value pair in storage

    NOTE: intentionally shadows the builtin set() within this module.

    :param key: key to store under
    :param value: value to store (BLOB-compatible)
    '''
    global open_database
    cursor = open_database.cursor()
    # OR REPLACE gives set() upsert semantics: re-setting an existing
    # key overwrites the value instead of raising sqlite3.IntegrityError
    cursor.execute(
        'INSERT OR REPLACE INTO store (key, value) VALUES (?, ?);',
        (key, value,))
    open_database.commit()
|
||
|
||
def geo_set(key, lon, lat, value):
    '''
    Add or replace key/value in geo storage

    :param key: key to store under
    :param lon: longitude (stored in the lon column)
    :param lat: latitude (stored in the lat column; parameter was
        previously misspelled 'lan')
    :param value: value to store (BLOB-compatible)
    '''
    global open_database
    cursor = open_database.cursor()
    # OR REPLACE keeps geo_set() consistent with upsert semantics:
    # re-setting an existing key overwrites it rather than raising
    cursor.execute(
        'INSERT OR REPLACE INTO geostore (key, lon, lat, value) VALUES (?, ?, ?, ?);',
        (key, lon, lat, value,))
    open_database.commit()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters