diff --git a/README.md b/README.md index 07db955..f7c31c9 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ # ocr2pdf -**Convert images or scans to searchable PDFs!** +**Convert images and scans to searchable PDFs!** --- @@ -18,7 +18,7 @@ I recommend you use either: - The Bash script, which runs the Python script - The Docker image, which runs the Bash script -- A Google Colab or GitHub Actions server, both of which run the Docker container +- A Google Colab or GitHub Actions server, both of which run the Docker image Read on to find out which is best for you! @@ -30,9 +30,9 @@ It's as easy as 1, 2, 3! Get up and going in no time with these options: Are you on mobile or simply want an easy and seamless experience? -1. Open [the app](https://colab.research.google.com/drive/1yss_oypuRisb29_SnqLGgA759slQzNry) in your browser -2. Run the cell to convert your files and/or zipped folders -3. Find the OCR'd files in your [Google Drive](https://drive.google.com/drive/my-drive)`/ocr-pdf` +1. Open [Colab](https://colab.research.google.com/github/ipitio/ocr-pdf/blob/master/colab.ipynb) in your browser +2. Follow the instructions in the notebook +3. Find the OCR'd files in your [Drive](https://drive.google.com/drive/my-drive)`/ocr-pdf` ### Self-hosted: Prebuilt Docker Image diff --git a/colab.ipynb b/colab.ipynb new file mode 100644 index 0000000..47b314d --- /dev/null +++ b/colab.ipynb @@ -0,0 +1,99 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Colab Edition\n", + "\n", + "
\n", + "\n", + "[![logo](https://ipitio.github.io/ocr-pdf/public/wide.webp)](https://github.com/ipitio/ocr-pdf)\n", + "\n", + "

\n", + " ocr2pdf\n", + "

\n", + "\n", + "**Convert images and scans to searchable PDFs!**\n", + "\n", + "---\n", + "\n", + "[![downloads](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fipitio.github.io%2Fbackage%2Fipitio%2Focr-pdf%2Focr-pdf.json&query=%24.downloads&logo=github&logoColor=959da5&labelColor=333a41&label=pulls)](https://github.com/ipitio/ocr-pdf/pkgs/container/ocr-pdf) [![build](https://github.com/ipitio/ocr-pdf/actions/workflows/publish.yml/badge.svg)](https://github.com/ipitio/ocr-pdf/actions/workflows/publish.yml)\n", + "\n", + "
\n", + "\n", + "This notebook is meant to be run on [Colab](https://colab.research.google.com/github/ipitio/ocr-pdf/blob/master/colab.ipynb). It will convert your files and can optionally save them to [Drive](https://drive.google.com/drive/my-drive) `/ocr-pdf`. Open the link above for more information.\n", + "\n", + "## Steps\n", + "\n", + "1. Make two new folders, one inside the other\n", + " - The outer one can be named anything, say `pdf`\n", + " - The inner one must be named `todo`\n", + "2. Place your files in the `todo` folder\n", + " - Those by themselves will just be converted\n", + " - Those inside subfolders will also be merged in alphabetical order\n", + "3. Share the outer `pdf` folder with this notebook\n", + " - Zip the folder\n", + " - Open this notebook in [Colab](https://colab.research.google.com/github/ipitio/ocr-pdf/blob/master/colab.ipynb)\n", + " - Run the cell below to be prompted to connect Drive and upload the zip\n", + "\n", + "You'll be offered a zip of the converted (and merged) files to download locally, whether or not Drive was connected\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [
+ "import os\n",
+ "\n",
+ "# Uploads and downloads below need google.colab, so fail with a clear message\n",
+ "# here instead of a NameError on `files` further down\n",
+ "try:\n",
+ "    from google.colab import drive, files\n",
+ "except ImportError as err:\n",
+ "    raise RuntimeError(\"This notebook must be run on Google Colab\") from err\n",
+ "\n",
+ "# Connect to Drive (optional: the results are offered as a download either way)\n",
+ "try:\n",
+ "    drive.mount(\"/content/drive\", force_remount=True)\n",
+ "    drive_mounted = True\n",
+ "except Exception as err:\n",
+ "    print(f\"Drive not connected: {err}\")\n",
+ "    drive_mounted = False\n",
+ "\n",
+ "# Upload your zipped folder(s)\n",
+ "files.upload()\n",
+ "\n",
+ "# Get the names of the zip files in the working directory\n",
+ "archives = [name for name in os.listdir() if name.endswith(\".zip\")]\n",
+ "if not archives:\n",
+ "    raise Exception(\"No ZIP file found\")\n",
+ "\n",
+ "# Transform them\n",
+ "%pip install udocker\n",
+ "!udocker --allow-root install\n",
+ "\n",
+ "for archive in archives:\n",
+ "    # Per the steps above, pdf.zip holds a folder named pdf, so the extracted\n",
+ "    # folder is the archive name without its extension\n",
+ "    pdf = os.path.splitext(archive)[0]\n",
+ "    done = os.path.join(pdf, \"done\")\n",
+ "    result_zip = pdf + \".zip\"\n",
+ "\n",
+ "    # Shell lines use {name} so IPython substitutes the Python values; \"$pdf\" is\n",
+ "    # never expanded inside ordinary Python strings, hence the variables above\n",
+ "    !unzip -o \"{archive}\"\n",
+ "    !rm -f \"{archive}\"\n",
+ "    !udocker --allow-root run -v /content/\"{pdf}\":/app/pdf ghcr.io/ipitio/ocr-pdf bash predict.sh pdf\n",
+ "\n",
+ "    # A missing done folder means nothing was converted\n",
+ "    converted = os.listdir(done) if os.path.isdir(done) else []\n",
+ "\n",
+ "    # And load\n",
+ "    if drive_mounted and converted:\n",
+ "        !mkdir -p \"drive/MyDrive/ocr-pdf\"\n",
+ "        !cp -r \"{done}/\"* \"drive/MyDrive/ocr-pdf/\"\n",
+ "\n",
+ "    if len(converted) == 1 and os.path.isfile(os.path.join(done, converted[0])):\n",
+ "        # A single converted file is offered as-is\n",
+ "        files.download(os.path.join(done, converted[0]))\n",
+ "    elif converted:\n",
+ "        # Several files (or a merged subfolder) are bundled into one zip\n",
+ "        !zip -r \"{result_zip}\" \"{done}\"\n",
+ "        files.download(result_zip)\n",
+ "    else:\n",
+ "        print(\"No PDFs found\")"
+ ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}