diff --git a/src/library/refresher.py b/src/library/refresher.py index 860d1af..1ba97ae 100644 --- a/src/library/refresher.py +++ b/src/library/refresher.py @@ -462,18 +462,24 @@ def download_chunk(chunk, blob_service_client, datasets): download_xml = download_response.content if download_response.status_code == 200: try: + logger.debug('START CHARDET url: ' + url + ' and hash: ' + hash + ' and id: ' + id) detect_result = chardet.detect(download_xml) + logger.debug('END CHARDET url: ' + url + ' and hash: ' + hash + ' and id: ' + id) charset = detect_result['encoding'] # log error for undetectable charset, prevent PDFs from being downloaded to Unified Platform if charset is None: + logger.debug('CHARDET HAD NO RESULT url: ' + url + ' and hash: ' + hash + ' and id: ' + id) db.updateFileAsDownloadError(conn, id, 2) clean_containers_by_id(blob_service_client, id) continue - except: + except Exception as e: + logger.debug('CHARDET HAD ERROR url: ' + url + ' and hash: ' + hash + ' and id: ' + id + ' error '+ str(e)) charset = 'UTF-8' + logger.debug('START UPLOAD BLOB url: ' + url + ' and hash: ' + hash + ' and id: ' + id) blob_client.upload_blob( download_xml, overwrite=True, encoding=charset) blob_client.set_blob_tags({"document_id": id}) + logger.debug('START SET DB BLOB url: ' + url + ' and hash: ' + hash + ' and id: ' + id) db.updateFileAsDownloaded(conn, id) logger.debug('Successfully downloaded url: ' + url + ' and hash: ' + hash + ' and id: ' + id) else: