diff --git a/Dockerfile b/Dockerfile index 338a5ee..91cabcf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,61 +1,43 @@ # start with a base image -FROM ubuntu:14.04 - -# install dependencies -RUN apt-get update -RUN apt-get install -y autoconf automake libtool -RUN apt-get install -y libpng12-dev -RUN apt-get install -y libjpeg62-dev -RUN apt-get install -y g++ -RUN apt-get install -y libtiff4-dev -RUN apt-get install -y libopencv-dev libtesseract-dev -RUN apt-get install -y git -RUN apt-get install -y cmake -RUN apt-get install -y build-essential -RUN apt-get install -y libleptonica-dev -RUN apt-get install -y liblog4cplus-dev -RUN apt-get install -y libcurl3-dev -RUN apt-get install -y python2.7-dev -RUN apt-get install -y tk8.5 tcl8.5 tk8.5-dev tcl8.5-dev -RUN apt-get build-dep -y python-imaging --fix-missing -RUN apt-get install -y imagemagick -RUN apt-get install -y wget -RUN apt-get install -y python python-pip - -# build leptonica -RUN wget http://www.leptonica.org/source/leptonica-1.70.tar.gz -RUN tar -zxvf leptonica-1.70.tar.gz -WORKDIR leptonica-1.70/ -RUN ./autobuild -RUN ./configure -RUN make -RUN make install -RUN ldconfig -WORKDIR / -RUN ls - -ADD requirements.txt / -RUN pip install -r requirements.txt - -# build tesseract -RUN wget https://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.02.tar.gz -RUN tar -zxvf tesseract-ocr-3.02.02.tar.gz -WORKDIR tesseract-ocr/ -RUN ./autogen.sh -RUN ./configure -RUN make -RUN make install -RUN ldconfig -RUN cd .. 
- -# download the relevant Tesseract English Language Packages -RUN wget https://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.eng.tar.gz -RUN tar -xf tesseract-ocr-3.02.eng.tar.gz -RUN sudo cp -r tesseract-ocr/tessdata /usr/local/share/ +FROM tesseractshadow/tesseract4re + +# Turn off debconf messages during build +ENV DEBIAN_FRONTEND=noninteractive +ENV TERM=linux + +# Install system dependencies +# Docker says run apt-get update and install together, +# and then rm /var/lib/apt/lists to reduce image size. +RUN apt-get update && apt-get install -y \ + python3-pil \ + python3-requests \ + python3-pip \ + && rm -rf /var/lib/apt/lists/* + +RUN pip3 install --no-cache-dir --upgrade pip + + +# Add requirements.txt before rest of repo, for caching +COPY requirements.txt / +RUN pip3 install --no-cache-dir -r /requirements.txt + # update working directories -ADD ./flask_server /flask_server +# ADD . /app +COPY ./flask_server /flask_server WORKDIR /flask_server +# Make debconf interactive in the running container +ENV DEBIAN_FRONTEND=teletype + +# Set useful ENV vars +ENV PYTHONIOENCODING="utf-8" + +# Try to forward request and error logs to docker log collector +# Not sure this works. Use /var/log/nginx/* if running nginx. +RUN ln -sf /dev/stdout /var/log/access.log \ + && ln -sf /dev/stderr /var/log/error.log + +# app.py binds to $PORT (default 5000), so document that port, then run -EXPOSE 80 +EXPOSE 5000 -CMD ["python", "app.py"] \ No newline at end of file +CMD ["python3", "app.py"] diff --git a/README.md b/README.md index 7025ab1..fe733dc 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,104 @@ -Welcome! +## Python OCR Docker -Check out the blog post here >> https://realpython.com/blog/python/setting-up-a-simple-ocr-server/ +This is my branch of a +[RealPython Tutorial](https://realpython.com/blog/python/setting-up-a-simple-ocr-server/). +I've updated it for Python3 / Unicode, and dropped most of the non-Docker bits +so I can inherit from the tesseractshadow/tesseract4re Docker container. +The result is a simple OCR Docker app using Tesseract. 
It provides: +* cli.py - a command-line app that takes a URL and returns the text extracted from the image. +* app.py - a small Flask app that does the same thing, but in the browser. +The only preprocessing is a single call to ImageFilter.SHARPEN. + + +### Changes include: +* Updated for Python3 & unicode. +* Refactored to use tesseractshadow/tesseract4re Docker container. +* Allows file:/// URLs. (Relative to the _container!_) + +### Alternatives +* [tesseract-ocr-re](https://github.com/tesseract-shadow/tesseract-ocr-re) +* [tleyden/open-ocr](https://github.com/tleyden/open-ocr) Full-featured queued service. Written in Go. + + +### Usage + +Install Docker + +If you are not familiar with Docker please read [Docker - Get Started.](https://docs.docker.com/get-started/). + +#### Quick Start: CLI +The following should run the Docker container straight from DockerHub. + +``` +docker container run --publish 5000:5000 --interactive --tty ctwardy/python_ocr_tutorial:2.0 python3 cli.py +``` + +That should produce the following output: +``` +===OOOO=====CCCCC===RRRRRR===== +==OO==OO===CC=======RR===RR==== +==OO==OO===CC=======RR===RR==== +==OO==OO===CC=======RRRRRR===== +==OO==OO===CC=======RR==RR===== +==OO==OO===CC=======RR== RR==== +===OOOO=====CCCCC===RR====RR=== + +A simple OCR utility +What is the url of the image you would like to analyze? +``` + +Type the following: +``` +file:///flask_server/tests/advertisement.jpg +``` + +to see: +``` +The raw output from tesseract with no processing is: +-----------------BEGIN----------------- +b'ADVERTISEMENT.\n\nTus publication of the Works of Joan Knox, it is\nsupposed, will extend to Five Volumes. It was thought\nadvisable to commence the series with his History of\nthe Reformation in Scotland, as the work of greatest\nimportance. The next volume will thus contain the\nThird and Fourth Books, which continue the History to\nthe year 1564; at which period his historical labours\nmay be considered to terminate. 
But the Fifth Book,\nforming a sequel to the History, and published under\nhis name in 1644, will also be included. His Letters\nand Miscellaneous Writings will be arranged in the\nsubsequent volumes, as nearly as possible in chronolo-\ngical order ; each portion being introduced by a separate\nnotice, respecting the manuscript or printed copies from\nwhich they have been taken.\n\nIt may perhaps be expected that a Life of the Author\nshould have been prefixed to this volume. The Life of\nKnox, by Dr. M\xe2\x80\x98Crig, is however a work so universally\nknown, and of so much historical value, as to supersede\nany attempt that might be made for a detailed bio-' +------------------END------------------ +``` + +#### Quick Start: App +Runs another app from the same container, pulled from DockerHub: +``` +docker container run --publish 5000:5000 ctwardy/python_ocr_tutorial:2.0 +``` +Then visit `localhost:5000` in your browser and type `file:///flask_server/tests/advertisement.jpg` to +see essentially the same result, but rendered more nicely. Click "Again" to try another. + +Stop it with Ctrl-C as usual. + +#### Get this from GitHub +``` +git clone https://github.com/ctwardy/python_ocr_tutorial.git +``` + +#### Build the Docker image: +From the python_ocr_tutorial folder, do: +```docker image build --tag python_ocr_test .``` +(Note the trailing "." -- it means build from this folder.) + +#### Pull the Docker image: +In case you want to pull but not run. +```docker pull ctwardy/python_ocr_tutorial:2.0``` + +#### Run either the CLI or the App: +We did these above, but for reference. +* CLI: `docker container run --publish 5000:5000 --interactive --tty + ctwardy/python_ocr_tutorial:2.0 python3 cli.py` +* App: `docker container run --publish 5000:5000 ctwardy/python_ocr_tutorial:2.0` + +Stop the app using Ctrl-C. + +### TODO (or see GitHub Issues): +* Browser app still won't display "<" characters. 
The returned JSON is fine, +so it's getting sanitized in the javascript or browser. +* Add endpoint for POSTing images directly. +* Add file upload widget. +* Try Tesseract4 instead of Tesseract3: tesseractshadow/tesseract4cmp :) \ No newline at end of file diff --git a/_app.sh b/_app.sh deleted file mode 100644 index c7566b2..0000000 --- a/_app.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/sh - -# Setup App -wget https://github.com/rhgraysonii/ocr_tutorial/archive/v0.tar.gz -tar -xf v0.tar.gz -mv ocr_tutorial-0/* ../home/ -cd ../home -sudo apt-get install -y python-virtualenv -virtualenv env -source env/bin/activate -pip install -r requirements.txt diff --git a/_run.sh b/_run.sh deleted file mode 100644 index a00aca9..0000000 --- a/_run.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/sh - -# Install Dependencies -sudo apt-get update -sudo apt-get install -y autoconf automake libtool -sudo apt-get install -y libpng12-dev -sudo apt-get install -y libjpeg62-dev -sudo apt-get install -y g++ -sudo apt-get install -y libtiff4-dev -sudo apt-get install -y libopencv-dev libtesseract-dev -sudo apt-get install -y git -sudo apt-get install -y cmake -sudo apt-get install -y build-essential -sudo apt-get install -y libleptonica-dev -sudo apt-get install -y liblog4cplus-dev -sudo apt-get install -y libcurl3-dev -sudo apt-get install -y python2.7-dev -sudo apt-get install -y tk8.5 tcl8.5 tk8.5-dev tcl8.5-dev -sudo apt-get build-dep -y python-imaging --fix-missing -sudo apt-get install -y imagemagick - - -# Build Leptonica -wget http://www.leptonica.org/source/leptonica-1.70.tar.gz -tar -zxvf leptonica-1.70.tar.gz -cd leptonica-1.70/ -./autobuild -./configure -make -sudo make install -sudo ldconfig - - -# Build Tesseract -cd .. 
-wget https://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.02.tar.gz -tar -zxvf tesseract-ocr-3.02.02.tar.gz -cd tesseract-ocr/ -./autogen.sh -./configure -make -sudo make install -sudo ldconfig - - -# Set Environment Variable -TESSDATA_PREFIX=/usr/local/share/ - - -# Download the relevant Tesseract English Language Packages -cd .. -wget https://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.eng.tar.gz -tar -xf tesseract-ocr-3.02.eng.tar.gz -sudo cp -r tesseract-ocr/tessdata $TESSDATA_PREFIX diff --git a/flask_server/Procfile b/flask_server/Procfile index 2e35818..4106ba6 100644 --- a/flask_server/Procfile +++ b/flask_server/Procfile @@ -1 +1 @@ -web: python app.py +web: python3 app.py diff --git a/flask_server/Procfile.dev b/flask_server/Procfile.dev index 211eaa3..428a1fb 100644 --- a/flask_server/Procfile.dev +++ b/flask_server/Procfile.dev @@ -1,4 +1,4 @@ # Procfile.dev - development # Use the Flask development server. -web: python app.py +web: python3 app.py diff --git a/flask_server/app.py b/flask_server/app.py index fbd9d39..9d1e200 100644 --- a/flask_server/app.py +++ b/flask_server/app.py @@ -1,7 +1,10 @@ +# -*- coding: utf-8 -*- + import os import logging from logging import Formatter, FileHandler from flask import Flask, request, jsonify, render_template +import json from ocr import process_image @@ -16,27 +19,52 @@ def main(): @app.route('/v{}/ocr'.format(_VERSION), methods=["POST"]) def ocr(): + + # Read the URL + try: + url = request.get_json()['image_url'] + except TypeError: + print("TypeError trying get_json(). Trying to load from string.") + try: + data = json.loads(request.data.decode('utf-8')) # json.loads() dropped its 'encoding' kwarg in Python 3.9 + url = data['image_url'] # same key as the get_json() path above + except: + return jsonify( + {"error": "Could not get 'image_url' from the request object. Use JSON?", + "data": request.get_data(as_text=True)} + ) + except: + return jsonify( + {"error": "Non-TypeError. 
Did you send {'image_url': 'http://.....'}", + "data": request.get_data(as_text=True)} + ) + + # Process the image + print("URL extracted:", url) try: - url = request.json['image_url'] - if 'jpg' in url: - output = process_image(url) - return jsonify({"output": output}) - else: - return jsonify({"error": "only .jpg files, please"}) + output = process_image(url) + except OSError: + return jsonify({"error": "URL not recognized as image.", + "url": url}) except: return jsonify( - {"error": "Did you mean to send: {'image_url': 'some_jpeg_url'}"} + {"error": "Unknown error processing image.", + "request": request.get_data(as_text=True)} ) + app.logger.info(output) + return jsonify({"output": output}) @app.errorhandler(500) def internal_error(error): - print str(error) # ghetto logging + print("*** 500 ***\n{}".format(str(error))) # crude stdout logging + return jsonify({"error": str(error)}), 500 @app.errorhandler(404) def not_found_error(error): - print str(error) + print("*** 404 ***\n{}".format(str(error))) + return jsonify({"error": str(error)}), 404 if not app.debug: file_handler = FileHandler('error.log') @@ -52,4 +80,5 @@ def not_found_error(error): if __name__ == '__main__': port = int(os.environ.get('PORT', 5000)) + print("Started app.py on port: {port}".format(port=port)) app.run(host='0.0.0.0', port=port) diff --git a/flask_server/cli.py b/flask_server/cli.py index d1ff9f4..a73a498 100644 --- a/flask_server/cli.py +++ b/flask_server/cli.py @@ -1,29 +1,34 @@ +# -*- coding: utf-8 -*- + import sys -import requests import pytesseract from PIL import Image -from StringIO import StringIO - +from io import BytesIO +from localfile import session # Ensure requests.get() handles local files. 
def get_image(url): - return Image.open(StringIO(requests.get(url).content)) + return Image.open(BytesIO(session.get(url).content)) + +def std_print(s, end="\n"): + sys.stdout.write("{}{}".format(s, end)) if __name__ == '__main__': """Tool to test the raw output of pytesseract with a given input URL""" - sys.stdout.write(""" -===OOOO=====CCCCC===RRRRRR=====\n -==OO==OO===CC=======RR===RR====\n -==OO==OO===CC=======RR===RR====\n -==OO==OO===CC=======RRRRRR=====\n -==OO==OO===CC=======RR==RR=====\n -==OO==OO===CC=======RR== RR====\n -===OOOO=====CCCCC===RR====RR===\n\n + + std_print(""" +===OOOO=====CCCCC===RRRRRR===== +==OO==OO===CC=======RR===RR==== +==OO==OO===CC=======RR===RR==== +==OO==OO===CC=======RRRRRR===== +==OO==OO===CC=======RR==RR===== +==OO==OO===CC=======RR== RR==== +===OOOO=====CCCCC===RR====RR=== """) - sys.stdout.write("A simple OCR utility\n") - url = raw_input("What is the url of the image you would like to analyze?\n") + std_print("A simple OCR utility") + url = input("What is the url of the image you would like to analyze?") image = get_image(url) - sys.stdout.write("The raw output from tesseract with no processing is:\n\n") - sys.stdout.write("-----------------BEGIN-----------------\n") - sys.stdout.write(pytesseract.image_to_string(image) + "\n") - sys.stdout.write("------------------END------------------\n") + std_print("The raw output from tesseract with no processing is:") + std_print("-----------------BEGIN-----------------") + std_print(pytesseract.image_to_string(image).encode('utf-8')) + std_print("------------------END------------------") diff --git a/flask_server/localfile.py b/flask_server/localfile.py new file mode 100644 index 0000000..fda96c9 --- /dev/null +++ b/flask_server/localfile.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- + +"""Extend requests.get() to handle local files. 
+ + Using ssokolow's response to: + https://stackoverflow.com/questions/10123929/python-requests-fetch-a-file-from-a-local-url + +""" + +import requests +import os, sys + +if sys.version_info.major < 3: + from urllib import url2pathname +else: + from urllib.request import url2pathname + +class LocalFileAdapter(requests.adapters.BaseAdapter): + """Protocol Adapter to allow Requests to GET file:// URLs + + @todo: Properly handle non-empty hostname portions. + """ + + @staticmethod + def _chkpath(method, path): + """Return an HTTP status for the given filesystem path.""" + if method.lower() in ('put', 'delete'): + return 501, "Not Implemented" # TODO + elif method.lower() not in ('get', 'head'): + return 405, "Method Not Allowed" + elif os.path.isdir(path): + return 400, "Path Not A File" + elif not os.path.isfile(path): + return 404, "File Not Found" + elif not os.access(path, os.R_OK): + return 403, "Access Denied" + else: + return 200, "OK" + + def send(self, req, **kwargs): # pylint: disable=unused-argument + """Return the file specified by the given request + + @type req: C{PreparedRequest} + @todo: Should I bother filling `response.headers` and processing + If-Modified-Since and friends using `os.stat`? 
+ """ + path = os.path.normcase(os.path.normpath(url2pathname(req.path_url))) + response = requests.Response() + + response.status_code, response.reason = self._chkpath(req.method, path) + if response.status_code == 200 and req.method.lower() != 'head': + try: + response.raw = open(path, 'rb') + except (OSError, IOError) as err: + response.status_code = 500 + response.reason = str(err) + + if isinstance(req.url, bytes): + response.url = req.url.decode('utf-8') + else: + response.url = req.url + + response.request = req + response.connection = self + + return response + + def close(self): + pass + + +# Create a session and attach the local file adapter +session = requests.session() +session.mount('file://', LocalFileAdapter()) + diff --git a/flask_server/ocr.py b/flask_server/ocr.py index 161abca..64d9bc5 100644 --- a/flask_server/ocr.py +++ b/flask_server/ocr.py @@ -1,9 +1,11 @@ +# -*- coding: utf-8 -*- + import pytesseract -import requests +from localfile import session + from PIL import Image from PIL import ImageFilter -from StringIO import StringIO - +from io import BytesIO def process_image(url): image = _get_image(url) @@ -11,5 +13,5 @@ def process_image(url): return pytesseract.image_to_string(image) -def _get_image(url): - return Image.open(StringIO(requests.get(url).content)) +def _get_image(url, session=session): + return Image.open(BytesIO(session.get(url).content)) diff --git a/flask_server/static/js/script.js b/flask_server/static/js/script.js index 79ac340..6d9560e 100755 --- a/flask_server/static/js/script.js +++ b/flask_server/static/js/script.js @@ -23,8 +23,11 @@ $(function() { $("#retry").show() $("#results").show() $("#results").html("