Skip to content

Commit

Permalink
LOG ALL THE THINGS
Browse files (browse the repository at this point in the history)
  • Loading branch information
odscjames committed Jul 26, 2023
1 parent 199a128 commit 3452c38
Showing 1 changed file with 7 additions and 1 deletion.
8 changes: 7 additions & 1 deletion src/library/refresher.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,18 +462,24 @@ def download_chunk(chunk, blob_service_client, datasets):
download_xml = download_response.content
if download_response.status_code == 200:
try:
logger.debug('START CHARDET url: ' + url + ' and hash: ' + hash + ' and id: ' + id)
detect_result = chardet.detect(download_xml)
logger.debug('END CHARDET url: ' + url + ' and hash: ' + hash + ' and id: ' + id)
charset = detect_result['encoding']
# log error for undetectable charset, prevent PDFs from being downloaded to Unified Platform
if charset is None:
logger.debug('CHARDET HAD NO RESULT url: ' + url + ' and hash: ' + hash + ' and id: ' + id)
db.updateFileAsDownloadError(conn, id, 2)
clean_containers_by_id(blob_service_client, id)
continue
except:
except Exception as e:
logger.debug('CHARDET HAD ERROR url: ' + url + ' and hash: ' + hash + ' and id: ' + id + ' error '+ str(e))
charset = 'UTF-8'
logger.debug('START UPLOAD BLOB url: ' + url + ' and hash: ' + hash + ' and id: ' + id)
blob_client.upload_blob(
download_xml, overwrite=True, encoding=charset)
blob_client.set_blob_tags({"document_id": id})
logger.debug('START SET DB BLOB url: ' + url + ' and hash: ' + hash + ' and id: ' + id)
db.updateFileAsDownloaded(conn, id)
logger.debug('Successfully downloaded url: ' + url + ' and hash: ' + hash + ' and id: ' + id)
else:
Expand Down

0 comments on commit 3452c38

Please sign in to comment.