[incomplete] Begin adding "meeting scraper" logic to sync script
MTecknology committed Jul 8, 2024
1 parent cc6fac1 commit 5450533
Showing 6 changed files with 293 additions and 7 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -7,3 +7,6 @@ site
*.py[cod]
__pycache__/
dist/

# sync
_workspace/
15 changes: 13 additions & 2 deletions sync/__main__.py
@@ -7,6 +7,8 @@
import logging

# Local imports
import sync.db
import sync.collect
import sync.cloudflare
import sync.hugo
import sync.nginx
@@ -24,10 +26,14 @@ def main():
logging.getLogger().setLevel(options.loglevel.upper())

# Load source data from hugo
logging.debug(f'Reading input data from {options.data}')
hugo_data = sync.hugo.load_yaml(options.data)
logging.debug(f'Reading input data from {options.hugo_data}')
hugo_data = sync.hugo.load_yaml(options.hugo_data)
source_data = sync.hugo.normalize(hugo_data)

    # Connect to database if needed by action(s) [-c]
    if options.collect:
        sync.db.open(f'{options.local_data}/cache.db')

# [-n] Generate an nginx map file
if options.genmap:
logging.info(f'Generating nginx map file at {options.mapfile}')
@@ -38,6 +44,11 @@ def main():
logging.info('Synchronizing DNS with cloudflare')
sync.cloudflare.push_dns(source_data)

# [-c] Collect meeting data from remote feeds
if options.collect:
logging.info('Collecting meeting data')
sync.collect.fetch_all(source_data)


if __name__ == '__main__':
main()
46 changes: 46 additions & 0 deletions sync/collect/__init__.py
@@ -0,0 +1,46 @@
'''
Collect meeting data from remote sources
'''
import importlib
import logging


class CollectionException(Exception):
'''
Fatal (non-recoverable) feed import error
'''
pass


def refresh(source_url, source_type):
'''
Refresh meeting information from a source_url
'''
    try:
        collector = importlib.import_module('.' + source_type, __name__)
    except ImportError as err:
        raise CollectionException(
            f'Unable to load "{source_type}" collector: {err}')
    return collector.refresh(source_url)


def fetch_all(source_data):
'''
Attempt to refresh all feeds found in source_data
'''
# Assemble list of remote sources
sources = []
for subdomain, data in source_data.items():
if 'feed' not in data:
continue
if isinstance(data['feed'], list):
for feed in data['feed']:
sources.append(feed)
else:
sources.append(data['feed'])

# Process feeds
for source in sources:
src_type, src_url = source.split('^')
try:
refresh(src_url, src_type)
except CollectionException as e:
logging.error('Importing feed "%s" failed: %s', src_url, e)
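
Note: the feeds consumed by fetch_all() are plain 'type^url' strings, where the type names a collector submodule under sync.collect (currently only tsml, added below). A minimal sketch of that dispatch; the subdomain and URL here are hypothetical, only the 'type^url' convention and per-type submodule lookup come from the code above:

    # Hypothetical source_data entry for illustration only.
    source_data = {
        'district12': {'feed': 'tsml^https://district12.example.org/meetings/'},
    }
    src_type, src_url = source_data['district12']['feed'].split('^')
    # refresh() imports sync.collect.tsml and calls its refresh(src_url);
    # a CollectionException is logged by fetch_all() and the loop continues.
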
58 changes: 58 additions & 0 deletions sync/collect/tsml.py
@@ -0,0 +1,58 @@
'''
Collect meeting data from TSML
'''
import hashlib
import logging
import os
import re
import requests

import sync.db
from sync.collect import CollectionException

# Pattern to identify TSML-UI source data
TSML_RE = (
r'<div id="tsml-ui"\s+'
r'data-src="([^"]+\.json)[0-9?]*"\s+'
r'data-timezone="([^"]+)"')


def refresh(source_url):
'''
Pull meeting data from Twelve-Step-Meeting-List (TSML) websites
'''
# Fetch current meeting page
response = requests.get(source_url)
    if response.status_code != 200:
        raise CollectionException(
            f'Unexpected response [{response.status_code}] from {source_url}')

# Identify current source data from page
match = re.search(TSML_RE, response.text)
if not match:
        raise CollectionException(f'No TSML source data found in {source_url}')
data_src = match.group(1)
data_timezone = match.group(2)
source_json = f'{os.path.dirname(source_url)}{data_src}'

# Calculate checksum-based cache file from full url
url_checksum = hashlib.sha256(source_json.encode('utf-8')).hexdigest()
cache_json = f'tsml_{url_checksum}.json'
cache_tz = f'tsml_{url_checksum}.tz'

# Check if this json file has already been cached
if sync.db.exists(cache_json):
logging.info('%s exists in cache; using local copy', source_json)
else:
# Download TSML data
response = requests.get(source_json)
        if response.status_code != 200:
            raise CollectionException(
                f'Unexpected response [{response.status_code}] from {source_json}')
sync.db.set(cache_tz, data_timezone)
sync.db.set(cache_json, response.content)
logging.info('Collected meeting data from %s', source_json)

#
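
For reference, TSML_RE above is written to match a TSML-UI embed of roughly the following shape. The attribute values are hypothetical (real pages vary), and the snippet assumes it runs in the module above, where re is imported and TSML_RE is defined:

    # Hypothetical markup; group(1) is the JSON path, group(2) the timezone.
    sample_html = (
        '<div id="tsml-ui"\n'
        '     data-src="/wp-content/uploads/meetings.json?1720000000"\n'
        '     data-timezone="America/Chicago">')
    match = re.search(TSML_RE, sample_html)
    # match.group(1) -> '/wp-content/uploads/meetings.json'
    # match.group(2) -> 'America/Chicago'
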
156 changes: 156 additions & 0 deletions sync/db.py
@@ -0,0 +1,156 @@
'''
Internal Database
'''
import logging
import os
import pathlib
import sqlite3

# Container for loaded database
open_database = None


def open(dbpath):
'''
Open a sqlite3 database, initializing it if not present
'''
global open_database

# Close any open handler
if open_database:
open_database.close()

# Ensure parent directory exists
parent = pathlib.Path(os.path.dirname(dbpath))
parent.mkdir(parents=True, exist_ok=True)

# Open sqlite3 database
logging.info('Connecting to database at %s', dbpath)
open_database = sqlite3.connect(dbpath)

# Initialize schema
cursor = open_database.cursor()
cursor.executescript('''
BEGIN;
CREATE TABLE IF NOT EXISTS store (
key TEXT PRIMARY KEY,
value BLOB NOT NULL);
CREATE INDEX IF NOT EXISTS idx_store_key ON store (key);
CREATE TABLE IF NOT EXISTS geostore (
key TEXT PRIMARY KEY,
lon REAL NOT NULL,
lat REAL NOT NULL,
value BLOB NOT NULL);
CREATE INDEX IF NOT EXISTS idx_geostore_key ON geostore (key);
CREATE INDEX IF NOT EXISTS idx_geostore_lon ON geostore (lon);
CREATE INDEX IF NOT EXISTS idx_geostore_lat ON geostore (lat);
COMMIT;
''')


def exists(key, table='store'):
'''
Return True if a key exists in data store
'''
global open_database
cursor = open_database.cursor()
cursor.execute(
f'SELECT COUNT(key) FROM {table} WHERE key=?;',
(key,))
rowcount = cursor.fetchall()[0][0]
if rowcount == 0:
return False
elif rowcount == 1:
return True
raise Exception('Duplicate keys detected in data store')


def geo_exists(key):
'''
Return True if a key exists in geo store
'''
return exists(key, 'geostore')


def get(key, default=None, table='store'):
'''
Return stored value for a given key
'''
global open_database
cursor = open_database.cursor()
cursor.execute(
f'SELECT value FROM {table} WHERE key=?;',
(key,))
rows = cursor.fetchall()
if len(rows) == 0:
return default
elif len(rows) == 1:
return rows[0][0]
raise Exception('Duplicate keys detected in data store')


def geo_get(key, default=None):
'''
Return stored geo data for a given key
'''
return get(key, default, 'geostore')


def geo_search(minlon=None, maxlon=None, minlat=None, maxlat=None):
'''
Return results for a given geo search
'''
global open_database
cursor = open_database.cursor()

# Base SQL query
sql_query = 'SELECT key, lon, lat, value FROM geostore WHERE 1=1'

# Append conditions based on provided parameters
if minlon is not None:
sql_query += ' AND lon >= ?'
if maxlon is not None:
sql_query += ' AND lon <= ?'
if minlat is not None:
sql_query += ' AND lat >= ?'
if maxlat is not None:
sql_query += ' AND lat <= ?'

# Add appropriate arguments to query
params = []
if minlon is not None:
params.append(minlon)
if maxlon is not None:
params.append(maxlon)
if minlat is not None:
params.append(minlat)
if maxlat is not None:
params.append(maxlat)

# Fetch search results
cursor.execute(sql_query, tuple(params))
return cursor.fetchall()


def set(key, value):
'''
Add key/value into storage
'''
global open_database
cursor = open_database.cursor()
cursor.execute(
'INSERT INTO store (key, value) VALUES (?, ?);',
(key, value,))
open_database.commit()


def geo_set(key, lon, lat, value):
'''
Add key/value into geo storage
'''
global open_database
cursor = open_database.cursor()
cursor.execute(
'INSERT INTO geostore (key, lon, lat, value) VALUES (?, ?, ?, ?);',
        (key, lon, lat, value,))
open_database.commit()
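
A rough usage sketch of the new store/geostore helpers; the database path, keys, and coordinates below are illustrative only and not part of this commit:

    import sync.db

    sync.db.open('./_workspace/cache.db')       # creates the schema on first use
    key = 'tsml_feedchecksum.json'              # hypothetical cache key
    if not sync.db.exists(key):
        sync.db.set(key, b'{"meetings": []}')
    cached = sync.db.get(key)

    # geostore rows carry lon/lat columns so they can be filtered by bounding box
    sync.db.geo_set('meeting:example', -93.27, 44.98, b'{"name": "Example Group"}')
    nearby = sync.db.geo_search(minlon=-94.0, maxlon=-93.0, minlat=44.0, maxlat=45.0)
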
22 changes: 17 additions & 5 deletions sync/options.py
@@ -13,12 +13,12 @@ def parser():
parser = argparse.ArgumentParser(
usage='python3 -m sync [-h] [actions] <options>',
description='Synchronize sober.page data with various destinations',
epilog='At least one [action] must be provided.',
epilog='[*] At least one script action must be specified.',
formatter_class=lambda prog: argparse.HelpFormatter(
prog, max_help_position=30))

# Actions
actions = parser.add_argument_group('actions')
actions = parser.add_argument_group('actions[*]')
actions.add_argument(
'-n',
dest='genmap',
@@ -29,6 +29,11 @@ def parser():
dest='records',
action='store_true',
help='Synchronize DNS records')
actions.add_argument(
'-c',
dest='collect',
action='store_true',
help='Collect meeting data from remote feeds')

# Options
parser.add_argument(
@@ -39,12 +44,19 @@ def parser():
default='/etc/nginx/canonical_redirects.map',
help='Location of generated Nginx map file')
parser.add_argument(
'-d',
dest='data',
'-H',
dest='hugo_data',
action='store',
metavar='<path>',
default='./data/domains.yaml',
help='YAML file containing DNS data')
help='Path to hugo file containing DNS data')
parser.add_argument(
'-w',
dest='local_data',
action='store',
metavar='<path>',
default='./_workspace',
help='Local workspace used for importing/caching data')
parser.add_argument(
'-l',
dest='loglevel',
