From 7814ec3bab96734e8a91ce437e7de7d517b135cb Mon Sep 17 00:00:00 2001
From: teotoplak
Date: Fri, 3 Mar 2023 12:08:03 +0100
Subject: [PATCH] implemented translation of text in directories

---
Review note: src/scripts/translate.py was revised after export (it now
creates the output directory and names the failing file on errors), so the
blob id on its index line is stale.

 .gitignore               |  1 +
 environment.yml          |  1 +
 src/scripts/translate.py | 38 ++++++++++++++++++++++++++++++++++++++
 tst/crawl/test_crawl.py  |  2 ++
 4 files changed, 42 insertions(+)
 create mode 100644 src/scripts/translate.py

diff --git a/.gitignore b/.gitignore
index 5c5e8f8..26ade7f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@
 .openai.env
 .DS_Store
 data
+gcp_key.json
\ No newline at end of file
diff --git a/environment.yml b/environment.yml
index 0e199e6..96e78ab 100644
--- a/environment.yml
+++ b/environment.yml
@@ -11,3 +11,4 @@ dependencies:
   - pandas==1.5.3
   - tiktoken==0.2.0
   - beautifulsoup4==4.11.2
+  - google-cloud-translate==3.11.0
diff --git a/src/scripts/translate.py b/src/scripts/translate.py
new file mode 100644
index 0000000..c57cfd5
--- /dev/null
+++ b/src/scripts/translate.py
@@ -0,0 +1,38 @@
+"""Translate the .txt files of a directory tree with Google Cloud Translate."""
+import os
+
+from google.cloud import translate_v2 as translate
+
+
+if __name__ == '__main__':
+    # set up the translation client
+    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'gcp_key.json'
+    translate_client = translate.Client()
+    # specify the directory containing the text files
+    input_directory = './data/elections/estonian/'
+    # specify the output directory for the translated files
+    output_directory = './data/elections/english/'
+    # open() will not create the output directory, so create it up front
+    os.makedirs(output_directory, exist_ok=True)
+
+    for subdir, dirs, files in os.walk(input_directory):
+        for filename in files:
+            # only process .txt files
+            if not filename.endswith('.txt'):
+                continue
+            input_file_path = os.path.join(subdir, filename)
+            # NOTE: the output is flat, so equal file names from different
+            # sub-directories end up in the same output file
+            output_file_path = os.path.join(output_directory, filename)
+            with open(input_file_path, 'r', encoding='utf-8') as input_file:
+                text = input_file.read()
+            # translate the text from Estonian to English
+            try:
+                result = translate_client.translate(text, target_language='en', format_='text')
+            except Exception as e:
+                # best effort: report which file failed, carry on with the rest
+                print(f'{input_file_path}: {e}')
+                continue
+            # write the translated text to the output file
+            with open(output_file_path, 'w', encoding='utf-8') as output_file:
+                output_file.write(result['translatedText'])
diff --git a/tst/crawl/test_crawl.py b/tst/crawl/test_crawl.py
index a786281..e3ef389 100644
--- a/tst/crawl/test_crawl.py
+++ b/tst/crawl/test_crawl.py
@@ -21,3 +21,5 @@ def test_education_fact_sheet():
     ]
     for chunk in expected_chunks:
         assert chunk in chunks
+
+# TODO: https://tarkvalija.eu/sisend-valimisprogrammide-analuusiks-perepoliitika/
\ No newline at end of file