From 15d7329c88d55c0f7c0d7acfa224dc17ca3cedac Mon Sep 17 00:00:00 2001
From: Damien
Date: Tue, 31 Jul 2018 17:14:22 +0200
Subject: [PATCH 1/9] add live texts parsing to the CRON task

---
 update_promulgues.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/update_promulgues.sh b/update_promulgues.sh
index f9f33b0..22934e0 100755
--- a/update_promulgues.sh
+++ b/update_promulgues.sh
@@ -7,6 +7,8 @@ pyenv activate lafabrique
 DATADIR=data
 
 senapy-cli doslegs_urls --min-year=$((`date +%Y`)) | tlfp-parse-many $DATADIR --only-promulgated --quiet
+senapy-cli doslegs_urls --in-discussion | tlfp-parse-many $DATADIR --quiet
+anpy-cli doslegs_urls --in-discussion | tlfp-parse-many $DATADIR --quiet
 
 echo
 python tlfp/generate_dossiers_csv.py $DATADIR

From 09834d8a803c1346feb6beeb5b1cfa5e393ca615 Mon Sep 17 00:00:00 2001
From: Damien
Date: Tue, 31 Jul 2018 17:39:08 +0200
Subject: [PATCH 2/9] update_promulgues.sh: improve distinction between
 promulgated and live texts

---
 update_promulgues.sh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/update_promulgues.sh b/update_promulgues.sh
index 22934e0..af05c43 100755
--- a/update_promulgues.sh
+++ b/update_promulgues.sh
@@ -6,8 +6,15 @@ pyenv activate lafabrique
 
 DATADIR=data
 
+echo "Parsing new promulgated texts..."
 senapy-cli doslegs_urls --min-year=$((`date +%Y`)) | tlfp-parse-many $DATADIR --only-promulgated --quiet
+
+echo
+echo "Parsing texts in discussion in the Senate..."
 senapy-cli doslegs_urls --in-discussion | tlfp-parse-many $DATADIR --quiet
+
+echo
+echo "Parsing texts in discussion in the National Assembly..."
 anpy-cli doslegs_urls --in-discussion | tlfp-parse-many $DATADIR --quiet
 
 echo
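
The two added lines reuse the promulgated-texts pipeline for live texts: each chamber's doslegs_urls command prints one dossier URL per line, and tlfp-parse-many consumes them on stdin. A rough Python sketch of one such pipe, using only the commands and flags visible in the diffs above; the helper function and the data directory value are illustrative, not part of the repository:

    # sketch: <urls producer> | tlfp-parse-many <datadir> --quiet, driven from Python
    import subprocess

    DATADIR = 'data'  # same value as in update_promulgues.sh

    def pipe_urls_into_parser(producer_cmd):
        # collect the URL list from the producer, then feed it to tlfp-parse-many
        urls = subprocess.run(producer_cmd, check=True,
                              capture_output=True, text=True).stdout
        subprocess.run(['tlfp-parse-many', DATADIR, '--quiet'],
                       input=urls, text=True, check=True)

    pipe_urls_into_parser(['senapy-cli', 'doslegs_urls', '--in-discussion'])
    pipe_urls_into_parser(['anpy-cli', 'doslegs_urls', '--in-discussion'])

Driving the CLIs through subprocess keeps the sketch faithful to the shell script rather than guessing at senapy/anpy internals.
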
From 1211a072c384f0bf4e3aba46b3ade7331e7cd6d2 Mon Sep 17 00:00:00 2001
From: Damien
Date: Tue, 31 Jul 2018 18:30:59 +0200
Subject: [PATCH 3/9] dump live texts to "logs-encours" and fix counting of
 failing cases

---
 tlfp/generate_dossiers_csv.py | 10 ++++++++--
 tlfp/parse_one.py             | 12 ++++++++----
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/tlfp/generate_dossiers_csv.py b/tlfp/generate_dossiers_csv.py
index dfa72b9..07ea6bc 100644
--- a/tlfp/generate_dossiers_csv.py
+++ b/tlfp/generate_dossiers_csv.py
@@ -82,15 +82,21 @@
         total_promulgues += 1
 
 erreurs = len(glob.glob(os.path.join(API_DIRECTORY, 'logs/*')))
+erreurs_encours = len(glob.glob(os.path.join(API_DIRECTORY, 'logs-encours/*')))
+
+total_encours = total_doslegs - total_promulgues
+maximum = total_promulgues + erreurs # assume qu'aucun en cours n'echoue
 
 print(total_doslegs, 'doslegs in csv')
 print(total_promulgues, 'promulgués')
+print(total_encours, 'en cours')
 print(erreurs, 'parsings échoués')
-print('%.1f%s OK' % (100*total_promulgues/(total_promulgues + erreurs), '%'))
+print('%.1f%s OK' % (100*total_promulgues/(total_promulgues + erreurs), '%'), 'de promulgués qui passent')
+print('%.1f%s OK' % (100*total_encours/(total_encours + erreurs_encours), '%'), 'de textes en cours qui passent')
 
 home_json_final = {
     "total": total_promulgues,
-    "encours": total_doslegs - total_promulgues,
+    "encours": total_encours,
     "maximum": total_promulgues + erreurs
 }
 home_json_data.sort(key=lambda x: -x['total_amendements'])
diff --git a/tlfp/parse_one.py b/tlfp/parse_one.py
index 4ee7d75..fe7f369 100644
--- a/tlfp/parse_one.py
+++ b/tlfp/parse_one.py
@@ -125,7 +125,7 @@ def __getattr__(self, attr):
     sys.stderr = _stderr
 
 
-def dump_error_log(url, exception, api_dir, log):
+def dump_error_log(url, exception, logdir, log):
     log = log.getvalue() + '\n' + ''.join(traceback.format_tb(exception.__traceback__))
 
     url_id = url.replace('/', '')
@@ -134,8 +134,8 @@ def dump_error_log(url, exception, api_dir, log):
     elif 'senat.fr' in url:
         url_id = url.split('/')[-1].replace('.html', '')
-    mkdirs(os.path.join(api_dir, 'logs'))
-    logfile = os.path.join(api_dir, 'logs', url_id)
+    mkdirs(logdir)
+    logfile = os.path.join(logdir, url_id)
     print('[error] parsing', url, 'failed. Details in', logfile)
     open(logfile, 'w').write(log)
 
 
@@ -147,6 +147,7 @@ def process(API_DIRECTORY, url):
     verbose = '--quiet' not in sys.argv
     if not disable_cache:
         enable_requests_cache()
+    dos = None
     with log_print(io.StringIO()) as log:
         try:
             if verbose:
@@ -193,7 +194,10 @@ def process(API_DIRECTORY, url):
         raise e
     except Exception as e:
         # dump log for each failed doslegs in logs/
-        dump_error_log(url, e, API_DIRECTORY, log)
+        logdir = os.path.join(API_DIRECTORY, 'logs')
+        if dos and not dos.get('url_jo'):
+            logdir = 'logs-encours'
+        dump_error_log(url, e, logdir, log)
         raise e
 
 

From beaacb845a6bdaf36bd6f10ca6bc14665fe4c02a Mon Sep 17 00:00:00 2001
From: Damien
Date: Tue, 31 Jul 2018 19:15:46 +0200
Subject: [PATCH 4/9] tlfp/generate_dossiers_csv.py: improve formatting a
 little bit more

---
 tlfp/generate_dossiers_csv.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tlfp/generate_dossiers_csv.py b/tlfp/generate_dossiers_csv.py
index 07ea6bc..80f0864 100644
--- a/tlfp/generate_dossiers_csv.py
+++ b/tlfp/generate_dossiers_csv.py
@@ -88,11 +88,8 @@
 maximum = total_promulgues + erreurs # assume qu'aucun en cours n'echoue
 
 print(total_doslegs, 'doslegs in csv')
-print(total_promulgues, 'promulgués')
-print(total_encours, 'en cours')
-print(erreurs, 'parsings échoués')
-print('%.1f%s OK' % (100*total_promulgues/(total_promulgues + erreurs), '%'), 'de promulgués qui passent')
-print('%.1f%s OK' % (100*total_encours/(total_encours + erreurs_encours), '%'), 'de textes en cours qui passent')
+print('%.1f%s (%d/%d)' % (100*total_promulgues/(total_promulgues + erreurs), '%', total_promulgues, total_promulgues + erreurs), 'de promulgués qui passent')
+print('%.1f%s (%d/%d)' % (100*total_encours/(total_encours + erreurs_encours), '%', total_encours, total_encours + erreurs_encours), 'de textes en cours qui passent')
 
 home_json_final = {
     "total": total_promulgues,
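
Patch 3 counts one failure per file left in logs/ (and logs-encours/), and patch 4 condenses the report into ratio lines. A self-contained sketch of that arithmetic, with an illustrative count standing in for the value the script accumulates from the csv:

    import glob, os

    API_DIRECTORY = 'data'   # illustrative, matches DATADIR in the cron script
    total_promulgues = 1000  # illustrative count taken from the csv

    # one file per failed parsing, so the theoretical maximum is ok + failed
    erreurs = len(glob.glob(os.path.join(API_DIRECTORY, 'logs/*')))
    maximum = total_promulgues + erreurs
    print('%.1f%% (%d/%d) de promulgués qui passent'
          % (100 * total_promulgues / maximum, total_promulgues, maximum))
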
From 15d035a91e2f3b52bf89f93a46e85986262de262 Mon Sep 17 00:00:00 2001
From: Damien
Date: Wed, 1 Aug 2018 11:11:36 +0200
Subject: [PATCH 5/9] parse_one: fix log dump location for live texts

---
 tlfp/parse_one.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tlfp/parse_one.py b/tlfp/parse_one.py
index fe7f369..3931e11 100644
--- a/tlfp/parse_one.py
+++ b/tlfp/parse_one.py
@@ -196,7 +196,7 @@ def process(API_DIRECTORY, url):
         # dump log for each failed doslegs in logs/
         logdir = os.path.join(API_DIRECTORY, 'logs')
         if dos and not dos.get('url_jo'):
-            logdir = 'logs-encours'
+            logdir = os.path.join(API_DIRECTORY, 'logs-encours')
         dump_error_log(url, e, logdir, log)
         raise e
 

From d6b9362ee28cc900346a3319e11e19f192d612eb Mon Sep 17 00:00:00 2001
From: Damien
Date: Wed, 1 Aug 2018 15:42:08 +0200
Subject: [PATCH 6/9] also clean 'logs-encours/{bill}' when a bill now passes

---
 tlfp/format_data_for_frontend.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tlfp/format_data_for_frontend.py b/tlfp/format_data_for_frontend.py
index dba19b3..6523257 100644
--- a/tlfp/format_data_for_frontend.py
+++ b/tlfp/format_data_for_frontend.py
@@ -22,9 +22,11 @@ def dump_success_log(output_dir, log):
         f.write(log)
     textid = output_dir.split('/')[-1]
     api_dir = output_dir.replace('/' + textid, '')
-    err_log = os.path.join(api_dir, 'logs', textid)
-    if os.path.exists(err_log):
-        os.remove(err_log)
+
+    for err_dir in ('logs', 'logs-encours'):
+        err_log = os.path.join(api_dir, err_dir, textid)
+        if os.path.exists(err_log):
+            os.remove(err_log)
 
 
 def process(dos, OUTPUT_DIR, log=io.StringIO(), skip_already_done=False):
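
The hunk above clears stale error logs from both directories once a bill parses again. The same logic as a standalone helper; the directory and bill id passed at the end are hypothetical:

    import os

    def clear_error_logs(api_dir, textid):
        # remove any stale error log left over from earlier failed runs
        for err_dir in ('logs', 'logs-encours'):
            err_log = os.path.join(api_dir, err_dir, textid)
            if os.path.exists(err_log):
                os.remove(err_log)

    clear_error_logs('data', 'pjl17-424')  # hypothetical bill id
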
From 2cfeb015cae6b08a7aa940e2833179f39d5a45e8 Mon Sep 17 00:00:00 2001
From: Benjamin Ooghe-Tabanou
Date: Thu, 30 Aug 2018 19:18:27 +0200
Subject: [PATCH 7/9] simplify maxima/totals

---
 tlfp/generate_dossiers_csv.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tlfp/generate_dossiers_csv.py b/tlfp/generate_dossiers_csv.py
index 80f0864..f7c1707 100644
--- a/tlfp/generate_dossiers_csv.py
+++ b/tlfp/generate_dossiers_csv.py
@@ -81,20 +81,23 @@
     if dos.get('url_jo'):
         total_promulgues += 1
 
+total_encours = total_doslegs - total_promulgues
+
 erreurs = len(glob.glob(os.path.join(API_DIRECTORY, 'logs/*')))
 erreurs_encours = len(glob.glob(os.path.join(API_DIRECTORY, 'logs-encours/*')))
 
-total_encours = total_doslegs - total_promulgues
-maximum = total_promulgues + erreurs # assume qu'aucun en cours n'echoue
+max_promulgues = total_promulgues + erreurs
+max_encours = total_encours + erreurs_encours
+maximum = max_promulgues + max_encours
 
 print(total_doslegs, 'doslegs in csv')
-print('%.1f%s (%d/%d)' % (100*total_promulgues/(total_promulgues + erreurs), '%', total_promulgues, total_promulgues + erreurs), 'de promulgués qui passent')
-print('%.1f%s (%d/%d)' % (100*total_encours/(total_encours + erreurs_encours), '%', total_encours, total_encours + erreurs_encours), 'de textes en cours qui passent')
+print('%.1f%s (%d/%d)' % (100*total_promulgues/max_promulgues, '%', total_promulgues, max_promulgues), 'de promulgués qui passent')
+print('%.1f%s (%d/%d)' % (100*total_encours/max_encours, '%', total_encours, max_encours), 'de textes en cours qui passent')
 
 home_json_final = {
     "total": total_promulgues,
     "encours": total_encours,
-    "maximum": total_promulgues + erreurs
+    "maximum": max_promulgues,
 }
 home_json_data.sort(key=lambda x: -x['total_amendements'])
 home_json_final["focus"] = {
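
After this patch each population carries its own maximum and the global maximum is their sum, replacing the earlier shortcut that assumed no in-progress text ever fails. The invariant, checked with illustrative counts:

    # illustrative counts; the script derives them from the csv and the log dirs
    total_promulgues, erreurs = 1000, 50
    total_encours, erreurs_encours = 150, 30

    max_promulgues = total_promulgues + erreurs
    max_encours = total_encours + erreurs_encours
    maximum = max_promulgues + max_encours

    assert maximum == (total_promulgues + total_encours) + (erreurs + erreurs_encours)
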
From 59b467535798ee79aa3b60eb981494591d1fe97b Mon Sep 17 00:00:00 2001
From: Damien
Date: Tue, 11 Sep 2018 11:38:53 +0200
Subject: [PATCH 8/9] update parse_one.py with latest version from master to
 solve merge conflicts

---
 tlfp/parse_one.py | 126 +++++++++++++++++++---------------------------
 1 file changed, 52 insertions(+), 74 deletions(-)

diff --git a/tlfp/parse_one.py b/tlfp/parse_one.py
index 3931e11..7dfcbf3 100644
--- a/tlfp/parse_one.py
+++ b/tlfp/parse_one.py
@@ -1,4 +1,4 @@
-import sys, contextlib, io, os, traceback
+import sys, io, os, traceback
 
 from senapy.dosleg.parser import parse as senapy_parse
 from anpy.dossier_like_senapy import parse as anpy_parse
@@ -11,26 +11,37 @@
 from .tools.download_groupes import process as download_groupes
 from .tools.download_lois_dites import process as download_lois_dites
 from .tools.download_AN_opendata import process as download_AN_opendata
-from .tools.common import debug_file
+from .tools.common import debug_file, log_print
 from .merge import merge_senat_with_an
 
 
+class ParsingFailedException(Exception):
+    def __init__(self, exception, logfile):
+        super().__init__()
+        self.root_exception = exception
+        self.logfile = logfile
+
+
-def download_senat(url, log=sys.stderr, verbose=True):
-    if verbose: print(' [] download SENAT version')
-    html = download(url).text
-    if verbose: print(' [] parse SENAT version')
+def download_senat(url, log=sys.stderr):
+    print(' [] download SENAT version')
+    resp = download(url)
+    if resp.status_code != 200:
+        print('WARNING: Invalid response -', resp.status_code)
+        return
+    html = resp.text
+    print(' [] parse SENAT version')
     senat_dos = senapy_parse(html, url, logfile=log)
     debug_file(senat_dos, 'debug_senat_dos.json')
     return senat_dos
 
 
-def download_an(url, cached_opendata_an, url_senat=False, log=sys.stderr, verbose=True):
-    if verbose: print(' [] download AN version')
-    if verbose: print(' [] parse AN version')
+def download_an(url, cached_opendata_an, url_senat=False, log=sys.stderr):
+    print(' [] download AN version')
+    print(' [] parse AN version')
     # TODO: do both instead of first
-    results = anpy_parse(url, logfile=log, verbose=verbose, cached_opendata_an=cached_opendata_an)
+    results = anpy_parse(url, logfile=log, cached_opendata_an=cached_opendata_an)
     if not results:
-        if verbose: print(' WARNING: AN DOS NOT FOUND', url)
+        print(' WARNING: AN DOS NOT FOUND', url)
         return
     an_dos = results[0]
     if len(results) > 1:
@@ -39,7 +50,7 @@ def download_an(url, cached_opendata_an, url_senat=False, log=sys.stderr, verbos
         if result.get('url_dossier_senat') == url_senat:
             an_dos = result
             break
-        if verbose: print(' WARNING: TOOK FIRST DOSLEG BUT THERE ARE %d OF THEM' % len(results))
+        print(' WARNING: TOOK FIRST DOSLEG BUT THERE ARE %d OF THEM' % len(results))
     debug_file(an_dos, 'debug_an_dos.json')
     return an_dos
@@ -59,25 +70,25 @@ def are_same_doslegs(senat_dos, an_dos):
     return False
 
 
-def download_merged_dos(url, cached_opendata_an, log=sys.stderr, verbose=True):
+def download_merged_dos(url, cached_opendata_an, log=sys.stderr):
     """find dossier from url and returns (the_merged_dosleg, AN_dosleg, SENAT_dosleg)"""
     if not url.startswith('http') and ('pjl' in url or 'ppl' in url or 'plfss' in url):
         url = "http://www.senat.fr/dossier-legislatif/%s.html" % url
 
-    if verbose: print(' -= DOSLEG URL:', url, '=-')
+    print(' -= DOSLEG URL:', url, '=-')
 
     dos = None
     an_dos = None
     senat_dos = None
     if 'senat.fr' in url:
-        senat_dos = download_senat(url, verbose=verbose, log=log)
+        senat_dos = download_senat(url, log=log)
         if not senat_dos:
-            if verbose: print(' /!\ INVALID SENAT DOS')
+            print(' /!\ INVALID SENAT DOS')
             return None, None, None
 
         # Add AN version if there's one
         if 'url_dossier_assemblee' in senat_dos:
-            an_dos = download_an(senat_dos['url_dossier_assemblee'], cached_opendata_an, senat_dos['url_dossier_senat'], verbose=verbose, log=log)
+            an_dos = download_an(senat_dos['url_dossier_assemblee'], cached_opendata_an, senat_dos['url_dossier_senat'], log=log)
             if not an_dos:
                 return senat_dos, None, senat_dos
             if 'url_dossier_senat' in an_dos:
@@ -86,48 +97,18 @@ def download_merged_dos(url, cached_opendata_an, log=sys.stderr, verbose=True):
         else:
             dos = senat_dos
     elif 'assemblee-nationale.fr' in url:
-        an_dos = download_an(url, cached_opendata_an, verbose=verbose, log=log)
+        dos = an_dos = download_an(url, cached_opendata_an, log=log)
 
         # Add senat version if there's one
         if 'url_dossier_senat' in an_dos:
             senat_dos = download_senat(an_dos['url_dossier_senat'], log=log)
-            dos = merge_senat_with_an(senat_dos, an_dos)
-        else:
-            dos = an_dos
+            if senat_dos:
+                dos = merge_senat_with_an(senat_dos, an_dos)
     else:
-        if verbose: print(' INVALID URL:', url)
+        print(' INVALID URL:', url)
 
     return dos, an_dos, senat_dos
 
 
-@contextlib.contextmanager
-def log_print(file):
-    # capture all outputs to a log file while still printing it
-    class Logger:
-        def __init__(self, file):
-            self.terminal = sys.stdout
-            self.log = file
-            self.only_log = False
-
-        def write(self, message):
-            self.terminal.write(message)
-            self.log.write(message)
-
-        def __getattr__(self, attr):
-            return getattr(self.terminal, attr)
-
-    logger = Logger(file)
-
-    _stdout = sys.stdout
-    _stderr = sys.stderr
-    sys.stdout = logger
-    sys.stderr = logger
-    yield logger.log
-    sys.stdout = _stdout
-    sys.stderr = _stderr
-
-
 def dump_error_log(url, exception, logdir, log):
-    log = log.getvalue() + '\n' + ''.join(traceback.format_tb(exception.__traceback__))
-
     url_id = url.replace('/', '')
     if 'assemblee-nationale' in url:
         url_id = "%s-%s" % parse_national_assembly_url(url)
@@ -137,43 +118,39 @@ def dump_error_log(url, exception, logdir, log):
     elif 'senat.fr' in url:
         url_id = url.split('/')[-1].replace('.html', '')
     mkdirs(logdir)
     logfile = os.path.join(logdir, url_id)
-    print('[error] parsing', url, 'failed. Details in', logfile)
-    open(logfile, 'w').write(log)
+    with open(logfile, 'w') as f:
+        f.write(log.getvalue())
+
+    print('[error] parsing of', url, 'failed. Details in', logfile)
+
+    raise ParsingFailedException(exception, logfile)
 
 
 def process(API_DIRECTORY, url):
-    disable_cache = '--enable-cache' not in sys.argv
     only_promulgated = '--only-promulgated' in sys.argv
-    verbose = '--quiet' not in sys.argv
-    if not disable_cache:
+    quiet = '--quiet' in sys.argv
+    if '--enable-cache' in sys.argv:
         enable_requests_cache()
-    dos = None
-    with log_print(io.StringIO()) as log:
+
+    with log_print(only_log=quiet) as log:
         try:
-            if verbose:
-                print('======')
-                print(url)
+            print('======')
+            print(url)
 
             # download the AN open data or just retrieve the last stored version
             opendata_an = download_AN_opendata(API_DIRECTORY)
 
-            dos, an_dos, senat_dos = download_merged_dos(url, opendata_an, log=log, verbose=verbose)
+            dos, an_dos, senat_dos = download_merged_dos(url, opendata_an, log=log)
             if not dos:
-                return
+                raise Exception('Nothing found at %s' % url)
 
-            if verbose:
-                print(' title:', dos.get('long_title'))
-            find_anomalies([dos], verbose=verbose)
+            find_anomalies([dos])
 
             if not dos.get('url_jo') and only_promulgated:
-                if verbose:
-                    print(' ----- passed: no JO link')
+                print(' ----- passed: no JO link')
                 return
 
-            if not verbose:
-                print()
-                print('======')
-                print(url)
+            print(' title:', dos.get('long_title'))
 
             debug_file(dos, 'debug_dos.json')
@@ -190,15 +167,16 @@ def process(API_DIRECTORY, url):
             print(' [] format data for the frontend')
             format_data_for_frontend.process(dos_with_texts, API_DIRECTORY, log=log)
 
-    except KeyboardInterrupt as e:
+            return dos
+    except KeyboardInterrupt as e: # bypass the error log dump when doing Ctrl-C
         raise e
     except Exception as e:
+        print(*traceback.format_tb(e.__traceback__), e, sep='', file=log)
         # dump log for each failed doslegs in logs/
         logdir = os.path.join(API_DIRECTORY, 'logs')
        if dos and not dos.get('url_jo'):
             logdir = os.path.join(API_DIRECTORY, 'logs-encours')
         dump_error_log(url, e, logdir, log)
-        raise e
 
 
 if __name__ == '__main__':
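
The log_print helper removed above (master imports it from tools.common instead) tees stdout and stderr into a buffer while still printing. A condensed sketch of the same pattern; the try/finally that restores the streams is an addition for safety, not part of the original code:

    import contextlib, io, sys

    @contextlib.contextmanager
    def log_print(file):
        # tee everything printed to both the real stdout and the given buffer
        class Logger:
            def __init__(self, file):
                self.terminal = sys.stdout
                self.log = file
            def write(self, message):
                self.terminal.write(message)
                self.log.write(message)
            def __getattr__(self, attr):
                # delegate flush() and friends to the real stream
                return getattr(self.terminal, attr)
        _stdout, _stderr = sys.stdout, sys.stderr
        sys.stdout = sys.stderr = Logger(file)
        try:
            yield file
        finally:
            sys.stdout, sys.stderr = _stdout, _stderr

    with log_print(io.StringIO()) as log:
        print('printed and captured')
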
From cc6a8804c8f0b55b1acd9fb0b5920c63656f6f64 Mon Sep 17 00:00:00 2001
From: Damien
Date: Tue, 11 Sep 2018 14:59:44 +0200
Subject: [PATCH 9/9] Revert "update parse_one.py with latest version from
 master to solve merge conflicts"

This reverts commit 59b467535798ee79aa3b60eb981494591d1fe97b.
(pushed by accident)

---
 tlfp/parse_one.py | 126 +++++++++++++++++++++++++++------------------
 1 file changed, 74 insertions(+), 52 deletions(-)

diff --git a/tlfp/parse_one.py b/tlfp/parse_one.py
index 7dfcbf3..3931e11 100644
--- a/tlfp/parse_one.py
+++ b/tlfp/parse_one.py
@@ -1,4 +1,4 @@
-import sys, io, os, traceback
+import sys, contextlib, io, os, traceback
 
 from senapy.dosleg.parser import parse as senapy_parse
 from anpy.dossier_like_senapy import parse as anpy_parse
@@ -11,37 +11,26 @@
 from .tools.download_groupes import process as download_groupes
 from .tools.download_lois_dites import process as download_lois_dites
 from .tools.download_AN_opendata import process as download_AN_opendata
-from .tools.common import debug_file, log_print
+from .tools.common import debug_file
 from .merge import merge_senat_with_an
 
 
-class ParsingFailedException(Exception):
-    def __init__(self, exception, logfile):
-        super().__init__()
-        self.root_exception = exception
-        self.logfile = logfile
-
-
-def download_senat(url, log=sys.stderr):
-    print(' [] download SENAT version')
-    resp = download(url)
-    if resp.status_code != 200:
-        print('WARNING: Invalid response -', resp.status_code)
-        return
-    html = resp.text
-    print(' [] parse SENAT version')
+def download_senat(url, log=sys.stderr, verbose=True):
+    if verbose: print(' [] download SENAT version')
+    html = download(url).text
+    if verbose: print(' [] parse SENAT version')
     senat_dos = senapy_parse(html, url, logfile=log)
     debug_file(senat_dos, 'debug_senat_dos.json')
     return senat_dos
 
 
-def download_an(url, cached_opendata_an, url_senat=False, log=sys.stderr):
-    print(' [] download AN version')
-    print(' [] parse AN version')
+def download_an(url, cached_opendata_an, url_senat=False, log=sys.stderr, verbose=True):
+    if verbose: print(' [] download AN version')
+    if verbose: print(' [] parse AN version')
     # TODO: do both instead of first
-    results = anpy_parse(url, logfile=log, cached_opendata_an=cached_opendata_an)
+    results = anpy_parse(url, logfile=log, verbose=verbose, cached_opendata_an=cached_opendata_an)
     if not results:
-        print(' WARNING: AN DOS NOT FOUND', url)
+        if verbose: print(' WARNING: AN DOS NOT FOUND', url)
         return
     an_dos = results[0]
     if len(results) > 1:
@@ -50,7 +39,7 @@ def download_an(url, cached_opendata_an, url_senat=False, log=sys.stderr):
         if result.get('url_dossier_senat') == url_senat:
             an_dos = result
             break
-        print(' WARNING: TOOK FIRST DOSLEG BUT THERE ARE %d OF THEM' % len(results))
+        if verbose: print(' WARNING: TOOK FIRST DOSLEG BUT THERE ARE %d OF THEM' % len(results))
     debug_file(an_dos, 'debug_an_dos.json')
     return an_dos
@@ -70,25 +59,25 @@ def are_same_doslegs(senat_dos, an_dos):
     return False
 
 
-def download_merged_dos(url, cached_opendata_an, log=sys.stderr):
+def download_merged_dos(url, cached_opendata_an, log=sys.stderr, verbose=True):
     """find dossier from url and returns (the_merged_dosleg, AN_dosleg, SENAT_dosleg)"""
     if not url.startswith('http') and ('pjl' in url or 'ppl' in url or 'plfss' in url):
         url = "http://www.senat.fr/dossier-legislatif/%s.html" % url
 
-    print(' -= DOSLEG URL:', url, '=-')
+    if verbose: print(' -= DOSLEG URL:', url, '=-')
 
     dos = None
     an_dos = None
     senat_dos = None
     if 'senat.fr' in url:
-        senat_dos = download_senat(url, log=log)
+        senat_dos = download_senat(url, verbose=verbose, log=log)
         if not senat_dos:
-            print(' /!\ INVALID SENAT DOS')
+            if verbose: print(' /!\ INVALID SENAT DOS')
             return None, None, None
 
         # Add AN version if there's one
         if 'url_dossier_assemblee' in senat_dos:
-            an_dos = download_an(senat_dos['url_dossier_assemblee'], cached_opendata_an, senat_dos['url_dossier_senat'], log=log)
+            an_dos = download_an(senat_dos['url_dossier_assemblee'], cached_opendata_an, senat_dos['url_dossier_senat'], verbose=verbose, log=log)
             if not an_dos:
                 return senat_dos, None, senat_dos
             if 'url_dossier_senat' in an_dos:
@@ -97,18 +86,48 @@ def download_merged_dos(url, cached_opendata_an, log=sys.stderr):
         else:
             dos = senat_dos
     elif 'assemblee-nationale.fr' in url:
-        dos = an_dos = download_an(url, cached_opendata_an, log=log)
+        an_dos = download_an(url, cached_opendata_an, verbose=verbose, log=log)
 
         # Add senat version if there's one
         if 'url_dossier_senat' in an_dos:
             senat_dos = download_senat(an_dos['url_dossier_senat'], log=log)
-            if senat_dos:
-                dos = merge_senat_with_an(senat_dos, an_dos)
+            dos = merge_senat_with_an(senat_dos, an_dos)
+        else:
+            dos = an_dos
     else:
-        print(' INVALID URL:', url)
+        if verbose: print(' INVALID URL:', url)
 
     return dos, an_dos, senat_dos
 
 
+@contextlib.contextmanager
+def log_print(file):
+    # capture all outputs to a log file while still printing it
+    class Logger:
+        def __init__(self, file):
+            self.terminal = sys.stdout
+            self.log = file
+            self.only_log = False
+
+        def write(self, message):
+            self.terminal.write(message)
+            self.log.write(message)
+
+        def __getattr__(self, attr):
+            return getattr(self.terminal, attr)
+
+    logger = Logger(file)
+
+    _stdout = sys.stdout
+    _stderr = sys.stderr
+    sys.stdout = logger
+    sys.stderr = logger
+    yield logger.log
+    sys.stdout = _stdout
+    sys.stderr = _stderr
+
+
 def dump_error_log(url, exception, logdir, log):
+    log = log.getvalue() + '\n' + ''.join(traceback.format_tb(exception.__traceback__))
+
     url_id = url.replace('/', '')
     if 'assemblee-nationale' in url:
         url_id = "%s-%s" % parse_national_assembly_url(url)
@@ -118,39 +137,43 @@ def dump_error_log(url, exception, logdir, log):
     elif 'senat.fr' in url:
         url_id = url.split('/')[-1].replace('.html', '')
     mkdirs(logdir)
     logfile = os.path.join(logdir, url_id)
-    with open(logfile, 'w') as f:
-        f.write(log.getvalue())
-
-    print('[error] parsing of', url, 'failed. Details in', logfile)
-
-    raise ParsingFailedException(exception, logfile)
+    print('[error] parsing', url, 'failed. Details in', logfile)
+    open(logfile, 'w').write(log)
 
 
 def process(API_DIRECTORY, url):
+    disable_cache = '--enable-cache' not in sys.argv
     only_promulgated = '--only-promulgated' in sys.argv
-    quiet = '--quiet' in sys.argv
-    if '--enable-cache' in sys.argv:
+    verbose = '--quiet' not in sys.argv
+    if not disable_cache:
         enable_requests_cache()
-
-    with log_print(only_log=quiet) as log:
+    dos = None
+    with log_print(io.StringIO()) as log:
         try:
-            print('======')
-            print(url)
+            if verbose:
+                print('======')
+                print(url)
 
             # download the AN open data or just retrieve the last stored version
             opendata_an = download_AN_opendata(API_DIRECTORY)
 
-            dos, an_dos, senat_dos = download_merged_dos(url, opendata_an, log=log)
+            dos, an_dos, senat_dos = download_merged_dos(url, opendata_an, log=log, verbose=verbose)
             if not dos:
-                raise Exception('Nothing found at %s' % url)
+                return
 
-            find_anomalies([dos])
+            if verbose:
+                print(' title:', dos.get('long_title'))
+            find_anomalies([dos], verbose=verbose)
 
             if not dos.get('url_jo') and only_promulgated:
-                print(' ----- passed: no JO link')
+                if verbose:
+                    print(' ----- passed: no JO link')
                 return
 
-            print(' title:', dos.get('long_title'))
+            if not verbose:
+                print()
+                print('======')
+                print(url)
 
             debug_file(dos, 'debug_dos.json')
@@ -167,16 +190,15 @@ def process(API_DIRECTORY, url):
             print(' [] format data for the frontend')
             format_data_for_frontend.process(dos_with_texts, API_DIRECTORY, log=log)
 
-            return dos
-    except KeyboardInterrupt as e: # bypass the error log dump when doing Ctrl-C
+    except KeyboardInterrupt as e:
         raise e
     except Exception as e:
-        print(*traceback.format_tb(e.__traceback__), e, sep='', file=log)
         # dump log for each failed doslegs in logs/
         logdir = os.path.join(API_DIRECTORY, 'logs')
         if dos and not dos.get('url_jo'):
             logdir = os.path.join(API_DIRECTORY, 'logs-encours')
         dump_error_log(url, e, logdir, log)
+        raise e
 
 
 if __name__ == '__main__':