Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

mime magic for extensionless files #99

Open
wants to merge 3 commits into
base: extensionless-filename
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 66 additions & 41 deletions textract/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,64 +14,89 @@
# the command line interface
DEFAULT_ENCODING = 'utf_8'

# Maps an alternative spelling of a file extension to the spelling that
# is used when looking up the parser (key: synonym, value: canonical form).
EXTENSION_SYNONYMS = {".jpeg": ".jpg", ".htm": ".html"}


def _get_extension(filename):
    """Return the extension used to select a parser for ``filename``.

    The extension is taken from the file name when it has one. When the
    name has no extension, the mimetype reported by ``magic`` for the
    file is mapped to an extension with ``mimetypes.guess_extension``.
    The result is returned in a predictable format (lower-cased when it
    comes from the file name, always with a leading period) after
    ``EXTENSION_SYNONYMS`` has been applied, so e.g. ``.jpeg`` is
    reported as ``.jpg``.

    Raises ``exceptions.MimetypeNotDetected`` when ``magic`` fails on
    the file, and ``exceptions.UnknownMimetypeExtension`` when the
    detected mimetype has no known extension.
    """
    _, ext = os.path.splitext(filename)
    ext = ext.lower()

    # if the extension doesn't exist, check the mimetype of the filename
    if not ext:
        try:
            mimetype = magic.from_file(filename, mime=True)
        except magic.MagicException:
            # surface the failure instead of printing it and carrying on
            # with an unbound ``mimetype``
            raise exceptions.MimetypeNotDetected(filename)
        ext = mimetypes.guess_extension(mimetype)
        if ext is None:
            raise exceptions.UnknownMimetypeExtension(filename, mimetype)

    # check the EXTENSION_SYNONYMS dictionary and otherwise return the
    # current extension
    return EXTENSION_SYNONYMS.get(ext, ext)

def _check_mime(filename):
    """Return the file extensions suggested by the content of ``filename``.

    The mimetype is detected with ``magic.from_file`` and any parameters
    after a ``;`` (e.g. a charset) are dropped before it is translated
    with ``mimetypes.guess_all_extensions``.

    Detection is best-effort: the result is always a list of extensions
    (each with a leading period), and it is empty when the mimetype
    cannot be detected or has no known extension. This function does not
    raise on detection failures.
    """
    try:
        mimetype = magic.from_file(filename, mime=True)
    except Exception:
        # best-effort lookup: a failing libmagic call simply means there
        # are no mime-based candidates (KeyboardInterrupt/SystemExit are
        # deliberately not swallowed)
        return []

    if not mimetype:
        return []

    # NOTE(review): some python-magic releases appear to return bytes on
    # Python 3 -- confirm; decoding is harmless when a str is returned
    # on Python 2.
    if isinstance(mimetype, bytes):
        mimetype = mimetype.decode('utf-8', 'replace')

    mimetype = mimetype.split(";")[0].strip()
    if not mimetype:
        # guess_all_extensions() calls .lower() on its argument, so it
        # must never be handed None or an empty value
        return []

    return mimetypes.guess_all_extensions(mimetype)

def process(filename, encoding=DEFAULT_ENCODING, **kwargs):
    """This is the core function used for extracting text. It routes the
    ``filename`` to the appropriate parser and returns the extracted
    text as a byte-string encoded with ``encoding``.

    Candidate extensions are tried in order: first the extension of the
    file name (if any), then the extensions suggested by the mimetype of
    the file contents (see ``_check_mime``). Each candidate is passed
    through ``EXTENSION_SYNONYMS`` and skipped when no matching
    ``<ext>_parser.py`` module exists next to this file. The first
    parser that returns non-empty text wins.

    Extra keyword arguments are forwarded to the parser's ``process``
    method.

    Raises ``exceptions.MissingFileError`` when ``filename`` does not
    exist, ``exceptions.ExtensionNotSupported`` when candidates were
    found but none has a parser, and ``exceptions.MimetypeNotDetected``
    when there is neither a file extension nor a mime-based candidate.
    If parsers ran but all produced empty text, that empty result is
    returned rather than reported as a detection failure.
    """

    # make sure the filename exists
    if not os.path.exists(filename):
        raise exceptions.MissingFileError(filename)

    # first we try to use the file extension, then fall back on the
    # extension(s) derived from the mimetype of the file contents
    _, name_ext = os.path.splitext(filename)
    candidates = [name_ext.lower()]
    mime_exts = _check_mime(filename)
    if isinstance(mime_exts, (list, tuple)):
        candidates.extend(mime_exts)
    else:
        # _check_mime may hand back a single extension (or nothing)
        candidates.append(mime_exts)

    this_dir = os.path.dirname(os.path.abspath(__file__))
    tried = []
    parser_ran = False
    text = None
    for ext in candidates:
        if not ext:
            continue
        ext = EXTENSION_SYNONYMS.get(ext, ext)
        if ext in tried:
            # the name and the mimetype often agree; don't parse twice
            continue
        tried.append(ext)

        # to avoid conflicts with packages that are installed globally
        # (e.g. python's json module), all extension parser modules have
        # the _parser extension
        rel_module = ext + '_parser'
        module_name = rel_module[1:]

        # if this module name doesn't exist in this directory it isn't
        # currently supported
        if not os.path.exists(os.path.join(this_dir, module_name + '.py')):
            continue

        # do the extraction; this is a relative import so the name of
        # the package is necessary
        filetype_module = importlib.import_module(
            rel_module, 'textract.parsers')
        parser = filetype_module.Parser()
        text = parser.process(filename, encoding, **kwargs)
        parser_ran = True
        if text:
            return text

    if parser_ran:
        # every applicable parser produced empty output (e.g. an empty
        # document); that is a valid result, not a detection failure
        return text
    if tried:
        raise exceptions.ExtensionNotSupported(tried[0])
    raise exceptions.MimetypeNotDetected(filename)