oa-get

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import csv, progressbar
import filetype

# import magic
import urllib3
import importlib
import sys
from urllib.error import HTTPError
from urllib.parse import urlparse
from urllib.request import urlopen
from urllib.request import Request

from os import path
from sys import argv, stderr
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

BUFSIZE = 1024000  # (1024KB)

from model import session
from model import Base
from model import set_source
from model import Article
from model import Journal
from model import SupplementaryMaterial
from sources import pmc_doi


# Database setup: the script works against a local SQLite file and binds the
# declarative metadata of ``Base`` to that engine.
engine = create_engine("sqlite:///mydata.sqlite")
Base.metadata.bind = engine

# Create the tables declared on ``Base``'s metadata.
Base.metadata.create_all(engine)

# Session factory plus the module-wide session used by the actions below.
# NOTE: this rebinds ``session`` and therefore replaces the object of the
# same name imported from ``model`` above.
Session = sessionmaker(bind=engine)
session = Session()

# Command line: ``oa-get <action> <source>``.  Both positional arguments are
# required; when either is missing the usage text is shown and the script
# exits with status 1.
if len(argv) < 3:  # action and/or source not given
    print(
        """
oa-get – Open Access Media Importer download operations

usage:  oa-get detect-duplicates [source] |
        oa-get download-metadata [source] |
        oa-get download-media [source] |
        oa-get update-mimetypes [source]

"""
    )
    exit(1)

action, target = argv[1], argv[2]

# Reject unknown actions with an explicit membership test.  An ``assert`` is
# removed when Python runs with ``-O``, which would let an unknown action
# through and make the script silently do nothing (no branch below matches).
if action not in (
    "detect-duplicates",
    "download-media",
    "download-metadata",
    "update-mimetypes",
):
    print("Unknown action “%s”.\n" % action)
    exit(2)

# Make the source plug-ins importable by their bare module name.
sys.path.insert(0, "../OpenAccessMediaImporterBot/sources")

# Every ".py" occurrence is stripped from the given source name before it is
# used as a module name.
module_name = target.replace(".py", "")
try:
    source_module = importlib.import_module(module_name)
except ImportError:  # the module could not be imported: treat as unknown source
    print("Unknown source “%s”.\n" % target)
    exit(3)

# Hand the source name (exactly as typed on the command line) to the model.
set_source(target)

from helpers import config, mediawiki, filename_from_url

if action == "detect-duplicates":
    # Select the materials whose reported major MIME type is audio or video.
    reported_audio = SupplementaryMaterial.mimetype_reported == "audio"
    reported_video = SupplementaryMaterial.mimetype_reported == "video"
    materials = (
        session.query(SupplementaryMaterial)
        .filter(reported_audio | reported_video)
        .all()
    )
    if not materials:
        print("No audio or video materials found.\n")
        exit(5)
    for material in materials:
        # "[X]" marks materials for which mediawiki.is_uploaded() is true,
        # "[ ]" marks all others.
        if mediawiki.is_uploaded(material):
            line_format = "[X] %s %s %s\n"
        else:
            line_format = "[ ] %s %s %s\n"
        print(line_format % (material.article.doi, material.title, material.label))


def check_mime_types():
    """Probe the local files of freely licensed, not-yet-checked materials.

    Selects every ``SupplementaryMaterial`` whose ``mimetype_reported`` and
    ``mime_subtype_reported`` are both unset, keeps those whose article
    license URL is listed in ``config.free_license_urls``, and runs
    ``filetype.guess`` on each remaining material's ``file_path``.  The
    session is committed once at the end.

    NOTE(review): the guessed MIME type and extension are only bound to local
    names and never stored on the material, so this function gives the final
    commit nothing to persist -- confirm what was meant to be written back.
    """
    unchecked = (
        session.query(SupplementaryMaterial)
        .filter_by(mimetype_reported=None, mime_subtype_reported=None)
        .all()
    )

    def is_freely_licensed(candidate):
        return candidate.article.license_url in config.free_license_urls

    # The licence filter is applied to the whole result before any file is read.
    for candidate in list(filter(is_freely_licensed, unchecked)):
        guess = filetype.guess(candidate.file_path)
        if guess is None:
            continue
        mime_type = guess.mime
        mime_subtype = guess.extension

    session.commit()


# Number of leading bytes fetched from each remote file for content sniffing.
# NOTE(review): filetype's README states that the first 261 bytes are enough
# for its signature checks -- confirm for the installed version.
_SNIFF_BYTES = 261

# Generic "some binary data" MIME type; it carries no usable information.
_GENERIC_BINARY = "application/octet-stream"


def _sniff_remote_mimetype(url):
    """Return the MIME type detected from the first bytes of *url*.

    Only the leading ``_SNIFF_BYTES`` bytes are requested (HTTP ``Range``
    header) and at most that many are read, so a server that ignores the
    header does not trigger a full download.  The response is always closed.

    Returns ``"application/octet-stream"`` when ``filetype`` does not
    recognise the data.  Raises ``OSError`` (which covers ``HTTPError``,
    ``URLError`` and socket timeouts) when the request fails.
    """
    request = Request(
        url,
        None,
        {
            "User-Agent": "oa-get/2012-10-26",
            "Range": "bytes=%s-%s" % (0, _SNIFF_BYTES - 1),
        },
    )
    with urlopen(request, timeout=15) as response:
        head = response.read(_SNIFF_BYTES)
    return filetype.guess_mime(head) or _GENERIC_BINARY


def _refine_office_mimetype(mimetype, url):
    """Tell PowerPoint and Excel apart from Word using the URL's extension.

    MS Office documents are all detected as application/msword, therefore
    guess based on the extension:
    <http://www.mediawiki.org/wiki/Manual_talk:Mime_type_detection#Fix_for_MS_Office_File_Confusion>
    Any other MIME type is returned unchanged.
    """
    if mimetype != "application/msword":
        return mimetype
    if url.endswith(("ppt", "PPT")):
        return "application/vnd.ms-powerpoint"
    if url.endswith(("xls", "XLS")):
        return "application/vnd.ms-excel"
    return mimetype


if action == "update-mimetypes":
    # Materials without a recorded "reported" type have not been checked yet.
    materials = (
        session.query(SupplementaryMaterial)
        .filter_by(mimetype_reported=None, mime_subtype_reported=None)
        .all()
    )
    print("Checking MIME types …\n")
    # Test for "nothing to do" explicitly instead of relying on the progress
    # bar raising AssertionError for a zero maximum.
    if not materials:
        print("No materials found where MIME type has to be checked.\n")
        exit(0)
    widgets = [
        progressbar.SimpleProgress(),
        " ",
        progressbar.Percentage(),
        " ",
        progressbar.Bar(),
        " ",
        progressbar.ETA(),
    ]
    p = progressbar.ProgressBar(maxval=len(materials), widgets=widgets).start()
    for checked, material in enumerate(materials, start=1):
        url = material.url
        # Advance the bar before the request so failed downloads count too.
        p.update(checked)
        try:
            detected_mimetype = _sniff_remote_mimetype(url)
        except OSError as e:
            # Leave the material untouched so that a later run retries it.
            print(
                "When trying to download <%s>, the following error occured: “%s”.\n"
                % (url, str(e))
            )
            continue
        detected_mimetype = _refine_office_mimetype(detected_mimetype, url)
        reported_mimetype = material.mimetype + "/" + material.mime_subtype
        if detected_mimetype == _GENERIC_BINARY:
            # Nothing useful was detected: trust what the source claimed.
            detected_mimetype = reported_mimetype

        # Record what the source claimed; this also marks the material as
        # checked, because the query above selects on these two fields.
        material.mimetype_reported = material.mimetype
        material.mime_subtype_reported = material.mime_subtype

        if detected_mimetype != reported_mimetype:
            detected_type, _, detected_subtype = detected_mimetype.partition("/")
            # Keep the old value for any part the detected type lacks.
            material.mimetype = detected_type or material.mimetype
            material.mime_subtype = detected_subtype or material.mime_subtype
            print(
                "DOI %s, %s, source claimed %s but is %s.\n"
                % (
                    material.article.doi,
                    material.url,
                    reported_mimetype,
                    detected_mimetype,
                )
            )
        session.commit()

if action == "download-metadata":
    source_path = config.get_metadata_raw_source_path(target)
    # URL of the file whose progress bar is currently shown; None until the
    # first progress report arrives.
    current_url = None
    for result in source_module.download_metadata(source_path):
        reported_url = result["url"]
        if reported_url != current_url:
            # A different file is being reported: announce it and start a
            # fresh progress bar sized to that file's total.
            current_url = reported_url
            print(
                "Downloading “%s”, saving into directory “%s” …\n"
                % (current_url, source_path)
            )
            bar = progressbar.ProgressBar(maxval=result["total"]).start()
        bar.update(result["completed"])

if action == "download-media":
    media_path = config.get_media_raw_source_path(target)
    materials = session.query(SupplementaryMaterial).all()
    print(len(materials))

    for material in materials:
        # Only material under a known free license is downloaded.
        license_url = material.article.license_url
        if license_url == "":
            continue
        if license_url not in config.free_license_urls:
            print("Unknown, possibly non-free license: <%s>\n" % license_url)
            continue

        if material.mimetype not in ("audio", "video"):
            continue

        if mediawiki.is_uploaded(material):
            print(
                "Skipping <%s>, already exists at %s.\n"
                % (material.url, mediawiki.get_wiki_name())
            )
            material.uploaded = True
            # Persist the flag right away, like the "downloaded" flag below;
            # otherwise it is lost when no later commit happens.
            session.commit()
            continue

        url = material.url
        try:
            req = Request(url, None, {"User-Agent": "oa-get/2012-07-21"})
            remote_file = urlopen(req)
        except HTTPError as e:
            print(
                "When trying to download <%s>, the following error occured: “%s”.\n"
                % (url, str(e))
            )
            exit(4)

        # The context manager closes the HTTP response on every path,
        # including the early ``continue`` below.
        with remote_file:
            total = int(remote_file.headers["content-length"])
            local_filename = path.join(media_path, filename_from_url(url))

            # if local file has same size as remote file, skip download
            try:
                already_complete = path.getsize(local_filename) == total
            except OSError:  # local file does not exist
                already_complete = False
            if already_complete:
                print("Skipping download of <%s>.\n" % url)
                material.downloaded = True
                session.commit()
                continue

            print("Downloading <%s>, saving as “%s” …\n" % (url, local_filename))
            p = progressbar.ProgressBar(maxval=total).start()
            completed = 0

            with open(local_filename, "wb") as local_file:
                while True:
                    chunk = remote_file.read(BUFSIZE)
                    # read() returns bytes; an empty result means end of
                    # stream.  (Comparing against the str "" never matches.)
                    if not chunk:
                        break
                    local_file.write(chunk)
                    completed += len(chunk)
                    p.update(completed)

        # Mark and commit once per file, after the transfer has finished.
        material.downloaded = True
        session.commit()