|
| 1 | +import os |
| 2 | +import re |
| 3 | +import requests |
| 4 | +import PyPDF2 |
| 5 | + |
def download_pdf(url, local_filename, timeout=30):
    """Download a PDF from a URL to a local file.

    Args:
        url: Address of the PDF to fetch.
        local_filename: Path of the file the response body is written to.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout, requests can block forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of saving an error page as a ".pdf".
    response.raise_for_status()
    with open(local_filename, 'wb') as f:
        f.write(response.content)
| 11 | + |
def extract_text_from_pdf(pdf_path):
    """Return the cleaned text of every page of the PDF at ``pdf_path``.

    Any failure is reported on stdout and signalled by returning None.
    """
    try:
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # A page without extractable text contributes an empty string.
            page_texts = [
                page.extract_text() or "" for page in pdf_reader.pages
            ]
        raw_text = "".join(page_texts)
        # Apply text cleaning after extraction.
        return clean_extracted_text(raw_text)
    except Exception as e:
        print(f"Failed to read {pdf_path}: {e}")
        return None
| 25 | + |
def clean_extracted_text(text):
    """Clean and format text extracted from a PDF.

    Runs of spaces/tabs are collapsed to one space, line breaks in the
    middle of a sentence are replaced by a space, and paragraph breaks
    (one or more blank lines) are normalized to exactly one blank line.
    A single line break that directly follows a period is kept, since it
    ends a sentence rather than interrupting one.

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    # Collapse horizontal whitespace only. Matching "\s+" here would also
    # swallow the newlines and flatten every paragraph into one line.
    cleaned_text = re.sub(r'[^\S\n]+', ' ', text)
    # Drop spaces next to line breaks so whitespace-only lines count as blank.
    cleaned_text = re.sub(r' ?\n ?', '\n', cleaned_text)
    # Preserve paragraphs: any run of blank lines becomes one blank line.
    cleaned_text = re.sub(r'\n{2,}', '\n\n', cleaned_text)
    # Join lines broken mid-sentence. Newlines that belong to a paragraph
    # break (adjacent to another newline) or follow a period are left alone.
    cleaned_text = re.sub(r'(?<![.\n])\n(?!\n)', ' ', cleaned_text)
    return cleaned_text.strip()
| 35 | + |
def convert_pdf_to_txt(pdf_path, save_to_file=True, output_folder="output_texts"):
    """Convert a single PDF to text, optionally saving it to a .txt file.

    The extracted text is printed; failures are reported on stdout rather
    than raised.

    Args:
        pdf_path: Local path or http(s) URL of the PDF.
        save_to_file: When true, write the text to
            ``<output_folder>/<PDF base name>.txt``.
        output_folder: Directory for the .txt file; created if missing.
    """
    import tempfile
    from urllib.parse import unquote, urlparse

    try:
        # Require a real scheme so a local file such as "http_notes.pdf"
        # is not mistaken for a URL.
        if pdf_path.lower().startswith(("http://", "https://")):
            # Download to the system temp directory: output_folder may not
            # exist yet, and the URL's last segment may not be a valid
            # file name (query strings, reserved characters).
            fd, local_pdf = tempfile.mkstemp(suffix=".pdf")
            os.close(fd)
            try:
                download_pdf(pdf_path, local_pdf)
                text = extract_text_from_pdf(local_pdf)
            finally:
                # Remove the temporary file even if the download failed.
                os.remove(local_pdf)
            # Name the output after the URL path only, ignoring any query.
            source_name = unquote(urlparse(pdf_path).path)
        else:
            # Handle local file
            text = extract_text_from_pdf(pdf_path)
            source_name = pdf_path

        if not text:
            print(f"Could not extract text from: {pdf_path}")
            return

        # Print the cleaned text
        print(f"\nExtracted text:\n{text}\n")

        if save_to_file:
            # Save the extracted text to a .txt file
            os.makedirs(output_folder, exist_ok=True)
            base_name = os.path.splitext(os.path.basename(source_name))[0]
            # A URL ending in "/" has no file name to reuse.
            base_name = base_name or "document"
            output_file = os.path.join(output_folder, f"{base_name}.txt")
            with open(output_file, 'w', encoding='utf-8') as txt_file:
                txt_file.write(text)
            print(f"Text successfully saved to: {output_file}")
    except Exception as e:
        # Top-level boundary of the script: report and carry on.
        print(f"Error processing {pdf_path}: {e}")
| 67 | + |
# Example usage: runs only when the file is executed as a script, so
# importing this module does not trigger a conversion.
if __name__ == "__main__":
    # Example PDF from the internet:
    # pdf = "https://fase.org.br/wp-content/uploads/2014/05/exemplo-de-pdf.pdf"

    # Example local PDF:
    pdf = "D:/repos/Python-Scripts/PDF to text/Atividade 28 Fev.pdf"

    # Convert PDF to text and save the cleaned text to a file
    convert_pdf_to_txt(pdf)
0 commit comments