Skip to content

Commit

Permalink
implemented translation of text in directories
Browse files Browse the repository at this point in the history
  • Loading branch information
teotoplak committed Mar 3, 2023
1 parent 5b1d215 commit 7814ec3
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
.openai.env
.DS_Store
data
gcp_key.json
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ dependencies:
- pandas==1.5.3
- tiktoken==0.2.0
- beautifulsoup4==4.11.2
- google-cloud-translate==3.11.0
31 changes: 31 additions & 0 deletions src/scripts/translate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import os
from google.cloud import translate_v2 as translate


def translate_directory(client, input_directory, output_directory):
    """Translate every ``.txt`` file under *input_directory* into English.

    The directory tree is walked recursively. Each translated file is written
    to the same relative location under *output_directory*, so files that
    share a name but live in different subdirectories do not overwrite each
    other. Missing output directories are created on demand.

    Args:
        client: Object exposing ``translate(text, target_language=...,
            format_=...)`` that returns a mapping with a ``'translatedText'``
            key (the script passes a ``translate.Client``).
        input_directory: Root directory containing the source text files.
        output_directory: Root directory for the translated files.

    Returns:
        List of the output file paths that were written.
    """
    written_paths = []
    for subdir, _dirs, files in os.walk(input_directory):
        # Mirror the input layout below the output root.
        relative_subdir = os.path.relpath(subdir, input_directory)
        target_dir = os.path.normpath(
            os.path.join(output_directory, relative_subdir)
        )
        for filename in files:
            # only process .txt files
            if not filename.endswith('.txt'):
                continue
            input_file_path = os.path.join(subdir, filename)
            output_file_path = os.path.join(target_dir, filename)
            with open(input_file_path, 'r', encoding='utf-8') as input_file:
                text = input_file.read()
            # Best effort: one failed request must not abort the whole batch,
            # so report which file failed and move on to the next one.
            try:
                result = client.translate(
                    text, target_language='en', format_='text'
                )
            except Exception as e:
                print(f'Failed to translate {input_file_path}: {e}')
                continue
            # Create the directory only once there is something to write.
            os.makedirs(target_dir, exist_ok=True)
            # write the translated text to the output file
            with open(output_file_path, 'w', encoding='utf-8') as output_file:
                output_file.write(result['translatedText'])
            written_paths.append(output_file_path)
    return written_paths


if __name__ == '__main__':
    # set up the translation client
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'gcp_key.json'
    translate_client = translate.Client()
    # specify the directory containing the text files
    input_directory = './data/elections/estonian/'
    # specify the output directory for the translated files
    output_directory = './data/elections/english/'

    translate_directory(translate_client, input_directory, output_directory)
2 changes: 2 additions & 0 deletions tst/crawl/test_crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@ def test_education_fact_sheet():
]
for chunk in expected_chunks:
assert chunk in chunks

# TODO: https://tarkvalija.eu/sisend-valimisprogrammide-analuusiks-perepoliitika/

0 comments on commit 7814ec3

Please sign in to comment.