-
Notifications
You must be signed in to change notification settings - Fork 4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Image extraction and conversion to formats. Multi parallel file execution for all forms so you can input multiple files quickly. Any file at all to PDF using LibreOffice, super powerful; sadly makes the Docker image larger but worth it. OCR PDF using OCRmyPDF; works awesomely for adding text to an image. Improved compression using the OCRmyPDF app. Settings page with custom download options such as - open in same window - open in new window - download - download as zip. Update detection in settings page; it should show a notification if there is an update (very hidden). UI cleanups. Add other image formats to PDF to Image. Various fixes to icons, and pdf.js usage.
- Loading branch information
Showing
54 changed files
with
82,343 additions
and
8,316 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,58 @@ | ||
# Build jbig2enc in a separate stage so the compiler toolchain never reaches
# the final image; only the resulting binary is copied out later.
FROM debian:bullseye-slim AS jbig2enc_builder

# Toolchain and headers required to build jbig2enc from source.
# The apt package lists are removed afterwards to keep the cached layer small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        git \
        automake \
        autoconf \
        libtool \
        libleptonica-dev \
        pkg-config \
        ca-certificates \
        zlib1g-dev \
        make \
        g++ && \
    rm -rf /var/lib/apt/lists/*

# A shallow clone is enough: only the latest revision is built.
RUN git clone --depth 1 https://github.com/agl/jbig2enc && \
    cd jbig2enc && \
    ./autogen.sh && \
    ./configure && \
    make && \
    make install
|
||
# Main stage
FROM openjdk:17-jdk-slim

# Install necessary dependencies (LibreOffice/unoconv for document conversion,
# OCRmyPDF and pngquant for OCR and compression).
# The apt package lists and the pip download cache are dropped to limit image size.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        libreoffice-core \
        libreoffice-common \
        libreoffice-writer \
        libreoffice-calc \
        libreoffice-impress \
        python3-uno \
        python3-pip \
        unoconv \
        pngquant \
        ocrmypdf && \
    rm -rf /var/lib/apt/lists/* && \
    pip install --user --upgrade --no-cache-dir ocrmypdf

# Copy the jbig2enc binary from the builder stage
COPY --from=jbig2enc_builder /usr/local/bin/jbig2 /usr/local/bin/jbig2

# Copy the application JAR file
COPY build/libs/*.jar app.jar

# Expose the application port
EXPOSE 8080

# Set environment variables
ENV LOG_LEVEL=INFO

# Run the application.
# - Exec-form ENTRYPOINT does not expand ${LOG_LEVEL}, so a shell is used to
#   expand it at container start; `exec` keeps java as PID 1 for signal handling.
# - JVM options must precede `-jar`; anything after the JAR path is handed to the
#   application as a plain argument.
# - `logging.level.root` is the Spring Boot property for the root log level.
# - "$@" forwards any extra `docker run` arguments to the application.
ENTRYPOINT ["sh", "-c", "exec java -Dlogging.level.root=\"${LOG_LEVEL}\" -jar /app.jar \"$@\"", "--"]
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# OCR Language Packs and Setup | ||
|
||
This document provides instructions on how to add additional language packs for the OCR tab in Stirling-PDF, both inside and outside of Docker. | ||
|
||
## How does the OCR work? | ||
Stirling-PDF uses OCRmyPDF which in turn uses tesseract for its text recognition. | ||
All credit goes to them for this awesome work! | ||
|
||
## Language Packs | ||
|
||
Tesseract OCR supports a variety of languages. You can find additional language packs in the Tesseract GitHub repositories: | ||
|
||
- [tessdata_fast](https://github.com/tesseract-ocr/tessdata_fast): These language packs are smaller and faster to load, but may provide lower recognition accuracy. | ||
- [tessdata](https://github.com/tesseract-ocr/tessdata): These language packs are larger and provide better recognition accuracy, but may take longer to load. | ||
|
||
Depending on your requirements, you can choose the appropriate language pack for your use case. By default, Stirling-PDF uses the `eng` pack from tessdata_fast, but this can be replaced. | ||
|
||
### Installing Language Packs | ||
|
||
1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need. | ||
2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/4.00/tessdata` | ||
|
||
#### Docker | ||
|
||
If you are using Docker, you need to expose the Tesseract tessdata directory as a volume in order to use the additional language packs. | ||
#### Docker Compose | ||
Modify your `docker-compose.yml` file to include the following volume configuration: | ||
|
||
|
||
```yaml | ||
services: | ||
  your_service_name: | ||
    image: your_docker_image_name | ||
    volumes: | ||
      - /location/of/trainingData:/usr/share/tesseract-ocr/4.00/tessdata | ||
``` | ||
#### Docker run | ||
Add the following to your existing `docker run` command (the host directory comes first, the container path second): | ||
```bash | ||
-v /location/of/trainingData:/usr/share/tesseract-ocr/4.00/tessdata | ||
``` | ||
|
||
#### Non-Docker | ||
If you are not using Docker, you need to install the OCR components yourself, including the OCRmyPDF app. | ||
See the [OCRmyPDF installation guide](https://ocrmypdf.readthedocs.io/en/latest/installation.html) for instructions. | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
94 changes: 94 additions & 0 deletions
94
src/main/java/stirling/software/SPDF/LibreOfficeListener.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
package stirling.software.SPDF; | ||
import java.io.IOException; | ||
import java.net.InetSocketAddress; | ||
import java.net.Socket; | ||
import java.util.concurrent.ExecutorService; | ||
import java.util.concurrent.Executors; | ||
|
||
/**
 * Process-wide singleton that manages a background {@code unoconv --listener}
 * process so document conversions can reuse one running LibreOffice instance.
 *
 * <p>The listener is started on demand by {@link #start()} and torn down either
 * explicitly through {@link #stop()} or automatically once no call to
 * {@link #start()} has been made for {@code ACTIVITY_TIMEOUT} milliseconds.
 *
 * <p>{@link #start()} and {@link #stop()} are synchronized on the singleton, so
 * they may be called from multiple threads.
 */
public class LibreOfficeListener {

    private static final LibreOfficeListener INSTANCE = new LibreOfficeListener();

    private static final long ACTIVITY_TIMEOUT = 20 * 60 * 1000; // 20 minutes
    private static final long IDLE_CHECK_INTERVAL = 5000; // 5 seconds
    private static final long STARTUP_TIMEOUT = 30000; // 30 seconds
    private static final long STARTUP_POLL_INTERVAL = 1000; // 1 second
    private static final int CONNECT_TIMEOUT = 1000; // 1 second
    private static final int LISTENER_PORT = 2002;

    private ExecutorService executorService;
    private Process process;
    // Written by callers of start() and read by the idle-monitor thread.
    private volatile long lastActivityTime;

    private LibreOfficeListener() {}

    /**
     * Returns the shared listener manager.
     *
     * @return the singleton instance
     */
    public static LibreOfficeListener getInstance() {
        return INSTANCE;
    }

    /**
     * Starts the listener process if it is not already running and waits (up to
     * {@code STARTUP_TIMEOUT} ms) until it accepts connections on
     * {@code LISTENER_PORT}.
     *
     * <p>If the listener is already alive, the call only refreshes the idle timer,
     * so a listener that is in regular use is not torn down by the monitor.
     * If the listener does not come up in time, a warning is printed and the
     * method returns normally (best effort).
     *
     * @throws IOException if the process cannot be launched, or if the calling
     *         thread is interrupted while waiting for the listener to come up
     */
    public synchronized void start() throws IOException {
        // Check if the listener is already running
        if (process != null && process.isAlive()) {
            lastActivityTime = System.currentTimeMillis();
            return;
        }

        // Drop any monitor left over from a previous (now dead) listener.
        stopMonitor();

        // Start the listener process. Its output is inherited so that unread
        // pipe buffers can never fill up and block the child process.
        final Process listener = new ProcessBuilder("unoconv", "--listener")
                .inheritIO()
                .start();
        process = listener;
        lastActivityTime = System.currentTimeMillis();

        // Start a background (daemon) thread to monitor the activity timeout;
        // being a daemon, it never keeps the JVM alive on its own.
        final ExecutorService monitor = Executors.newSingleThreadExecutor(task -> {
            Thread thread = new Thread(task, "libreoffice-idle-monitor");
            thread.setDaemon(true);
            return thread;
        });
        executorService = monitor;
        monitor.submit(() -> watchForInactivity(listener, monitor));

        awaitListener();
    }

    /**
     * Stops the idle monitor and destroys the listener process if it is alive.
     * Safe to call when the listener was never started.
     */
    public synchronized void stop() {
        // Stop the activity timeout monitor thread
        stopMonitor();

        // Stop the listener process
        if (process != null && process.isAlive()) {
            process.destroy();
        }
    }

    /** Polls the given process and destroys it once it has been idle for too long. */
    private void watchForInactivity(Process listener, ExecutorService monitor) {
        try {
            while (listener.isAlive()) {
                long idleTime = System.currentTimeMillis() - lastActivityTime;
                if (idleTime >= ACTIVITY_TIMEOUT) {
                    // If there has been no activity for too long, tear down the listener
                    listener.destroy();
                    break;
                }
                Thread.sleep(IDLE_CHECK_INTERVAL);
            }
        } catch (InterruptedException e) {
            // stop() interrupted us via shutdownNow(); keep the flag set and exit.
            Thread.currentThread().interrupt();
        } finally {
            // Let the worker thread terminate instead of idling forever.
            monitor.shutdown();
        }
    }

    /** Blocks until the listener port accepts connections or the startup timeout elapses. */
    private void awaitListener() throws IOException {
        long deadline = System.currentTimeMillis() + STARTUP_TIMEOUT;
        while (System.currentTimeMillis() < deadline) {
            if (isListenerRunning()) {
                lastActivityTime = System.currentTimeMillis();
                return;
            }
            try {
                Thread.sleep(STARTUP_POLL_INTERVAL);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new java.io.InterruptedIOException(
                        "Interrupted while waiting for the LibreOffice listener to start");
            }
        }
        System.err.println("LibreOffice listener did not accept connections within "
                + STARTUP_TIMEOUT + " ms");
    }

    /** Returns whether something accepts TCP connections on the listener port. */
    private boolean isListenerRunning() {
        System.out.println("waiting for listener to start");
        // try-with-resources closes the socket even when connect() fails
        try (Socket socket = new Socket()) {
            socket.connect(new InetSocketAddress("localhost", LISTENER_PORT), CONNECT_TIMEOUT);
            return true;
        } catch (IOException e) {
            return false;
        }
    }

    /** Shuts down the current idle monitor, if any. Caller must hold the instance lock. */
    private void stopMonitor() {
        if (executorService != null) {
            executorService.shutdownNow();
            executorService = null;
        }
    }

}
14 changes: 14 additions & 0 deletions
14
src/main/java/stirling/software/SPDF/config/AppConfig.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
package stirling.software.SPDF.config; | ||
|
||
import org.springframework.context.annotation.Bean; | ||
import org.springframework.context.annotation.Configuration; | ||
|
||
|
||
@Configuration | ||
public class AppConfig { | ||
@Bean(name = "appVersion") | ||
public String appVersion() { | ||
String version = getClass().getPackage().getImplementationVersion(); | ||
return (version != null) ? version : "0.3.3"; | ||
} | ||
} |
Oops, something went wrong.