-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathOCR.py
29 lines (18 loc) · 798 Bytes
/
OCR.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from pdf2image import convert_from_path
from subprocess import call
import PyPDF2 as pypdf
import os
# Name of the Tesseract executable; ejecuta_OCR uses it as the first token of
# the OCR command it runs.
# NOTE(review): the ".exe" suffix suggests a Windows-only setup — confirm.
TESSERACT_CMD = "tesseract.exe"
def firstPage(pdfdir, fileName):
    """Copy the first page of a PDF into a new single-page PDF.

    Args:
        pdfdir: Directory that contains the source PDF.
        fileName: Name of the source PDF inside ``pdfdir``.

    Returns:
        The path of the file that was written: ``fileName`` with its
        extension replaced by ``_fp.pdf``. The path is not prefixed with
        ``pdfdir``, so a bare file name is written relative to the current
        working directory.
    """
    # NOTE(review): PdfFileReader / PdfFileWriter / getPage / addPage are the
    # legacy PyPDF2 names — confirm the installed PyPDF2 still provides them.
    fr = pypdf.PdfFileReader(os.path.join(pdfdir, fileName), strict=False)
    fw = pypdf.PdfFileWriter()
    fw.addPage(fr.getPage(0))
    # splitext keeps the whole stem when fileName has no extension; splitting
    # on "." and dropping the last piece would leave an empty stem and make
    # every extension-less input collide on "_fp.pdf".
    stem, _ext = os.path.splitext(fileName)
    name = stem + "_fp.pdf"
    with open(name, 'wb+') as f:
        fw.write(f)
    return name
def ejecuta_OCR(arch_pdf, arch_jpg, name_txt):
    """Run Tesseract OCR on the first page of a PDF.

    The first page of ``arch_pdf`` is rendered at 300 DPI and saved as a JPEG
    at ``arch_jpg``; that image is then passed to the Tesseract executable
    with ``--dpi 300`` and Spanish (``-l spa``) as the recognition language.

    Args:
        arch_pdf: Path of the PDF to read.
        arch_jpg: Path where the rendered first page is saved as JPEG.
        name_txt: Output argument handed to Tesseract (presumably the output
            base name, to which Tesseract adds its own extension — confirm
            against the Tesseract manual).

    Returns:
        None. Tesseract's exit status is not checked.

    Raises:
        OSError: If the Tesseract executable cannot be started.
    """
    # Only the first page is used, so only the first page is rendered.
    pages = convert_from_path(arch_pdf, 300, first_page=1, last_page=1)
    pages[0].save(arch_jpg, 'JPEG')
    # Pass an argument list and skip the shell so that paths containing
    # spaces or shell metacharacters reach Tesseract unchanged.
    order = [TESSERACT_CMD, arch_jpg, name_txt, "--dpi", "300", "-l", "spa"]
    call(order)