diff --git a/.flake8 b/.flake8 index 2bcd70e..8dd399a 100644 --- a/.flake8 +++ b/.flake8 @@ -1,2 +1,3 @@ [flake8] max-line-length = 88 +extend-ignore = E203 diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000..20fad9f --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,2 @@ +# Apply style guidelines +0294ee15e9f6f0d9e18134370703acb845bdb0cd diff --git a/.gitignore b/.gitignore index 01d81c9..1e3a7b8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ # Intermediary output files temp/ -output/ +src/parenttext_pipeline/_version.py #credentials files credentials.json diff --git a/README.md b/README.md index aeacc9e..8540c54 100644 --- a/README.md +++ b/README.md @@ -11,10 +11,15 @@ Handles the process for producing RapidPro flows from data held in spreadsheets. # Run -To start the pipeline: +Two [operations] are currently available: + +- `pull_data`: Read data from various sources and store them locally in json format. +- `compile_flows`: Compile RapidPro flows from locally stored json files. + +To start the pipeline performing both operations in sequence: ``` -python -m parenttext_pipeline.cli +python -m parenttext_pipeline.cli pull_data compile_flows ``` You will need to create a file called 'config.py', in the current working directory, and define a callable called 'create_config' that returns the pipeline settings as a dict. More details can be found in the [configuration page][config]. @@ -27,6 +32,7 @@ You will need to create a file called 'config.py', in the current working direct - [Transcode tool] - to prepare video and audio files that may be used by ParentText applications +[operations]: docs/operations.md [config]: docs/configuration.md [Archive tool]: docs/archive.md [RapidPro flow importer]: docs/rapidpro-import.md diff --git a/docs/configuration.md b/docs/configuration.md index db76afe..69ac2bf 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -15,160 +15,26 @@ The `create_config` callable must return a `dict` of configuration settings. # Available settings -## sources +The main features of the config are a list of steps of the pipeline, and a list of sources to pull data from. +Steps are executed in order: the first step produces a temporary flow output file, and subsequent steps generally operate on the output of the previous step, most of the time (but not always) producing a new (temporary) flow output file. Some steps may also produce output artefacts other than flows, such as a list of translatable strings, or logs or reports for QA. Subsequent steps cannot read such outputs, however. For more details about steps, see [steps]. +There are different types of steps, and some types of steps may need additional input data that is used to create or operate on the input flows. Such data is defined in data sources, which may reference local files or files on the internet, in various formats. Steps may then reference one or multiple such data sources. For more details about sources, see [sources]. -### sources.filename +The *pull_data* operation takes data referenced by all sources and saves it in the local file system (folder `{inputpath}`) converted to json. It is agnostic of the actual steps. -The name prefix that will be used in filenames during processing. +The *compile_flows* operation executes the sequence of steps and writes the output to `{flows_outputbasename}.json` in `{outputpath}`.
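As a minimal sketch of what such a config can look like (all values below are placeholders rather than settings from an actual deployment; the individual fields are described next), a `config.py` could contain:

```python
# config.py -- minimal sketch; the sheet ID, version and models module are placeholders
def create_config():
    return {
        "meta": {"pipeline_version": "1.0.0"},  # must match the installed pipeline
        "flows_outputbasename": "parenttext_all",
        "sources": {
            "flow_definitions": {
                "format": "sheets",
                "subformat": "google_sheets",
                "files_list": ["<google-sheet-id>"],
            },
        },
        "steps": [
            {
                "id": "create_flows",
                "type": "create_flows",
                "sources": ["flow_definitions"],
                "models_module": "models.parenttext_models",  # placeholder
                "tags": [],
            },
        ],
    }
```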
-### sources.spreadsheet\_ids +The config has the following fields: -IDs of Google Sheets where the ParentText flows are defined. +- `meta`: meta information, such as the pipeline version the config needs to be run with +- `inputpath`, `temppath` and `outputpath` (optional): Paths to store/read input files, temp files, and output files. +- `flows_outputbasename`: Base filename of the output file and intermediate temp files. +- `output_split_number` (optional): Number of files to split the pipeline output (final flow definition) into. + - Used to divide the file at the final step to get it to a manageable size that can be uploaded to RapidPro. +- `steps`: A list of steps. For more details, see [steps] +- `sources`: A dictionary of data sources. For more details, see [sources] +- `sheet_names`: A dictionary mapping sheet names to sheet IDs (**for Google Sheets only**). + Sources can reference sheets by their ID or their sheet name. +- `parents`: **Not Implemented** One (or multiple?) parent repos whose sources can be referenced -### sources.crowdin\_name - -Name of the file that is produced to send to translators. - -### sources.tags - -Used to identify flows to be process. Possible values for tag 1: - -- onboarding -- dev\_assess -- ltp_activity -- home\_activity\_checkin -- module -- goal\_checkin -- safeguarding -- menu -- delivery - -### sources.split\_no - -The number of files into which the final flow definition will be split. - -Used to divide the file at the final step to get it to a manageable size that can be uploaded to RapidPro. - -## special\_expiration - -Used to modify expiration times. - -## default\_expiration - -Used to modify expiration times. - -## model - -Name of the Python module containing data models to use as part of the process of converting data extracted from sheets. - -## languages - -A list of language definitions that will be looked for to localize back into the flows. Each language definition consists of: - -- `language`: 3-letter language code used in RapidPro -- `code`: 2-letter code used in CrowdIn - -## translation\_repo - -Location of a git repository where translations are stored. - -## folder\_within\_repo - -The location within `tranlsation_repo` where translations are stored. - -Used in conjuction with `translation_repo`, above. - -## outputpath - -Destination path for all files (including intermediary files and log files). - -Default is 'output' within the current workin directory. - -## qr\_treatment - -How to process "quick replies". Valid values are: - -- move: Remove quick replies and add equivalents to them to the message text, and give numerical prompts to allow basic phone users to use the app. -- move_and_mod: As above but has additional functionality allowing you to replace phrases -- reformat: Reformat quick replies so that long ones are added to the message text, as above. -- reformat_whatsapp: Reformat quick replies to meet the whatsapp format -- reformat_china: Reformat quick replies to the standard as requested by China -- wechat: All quick replies moved to links in message text as can be used in WeChat -- none: Do nothing. - -## select\_phrases - -The default phrase we want to add if quick replies are being moved to message text. - -## add\_selectors - -If `qr_treatment` is 'move', add some basic numerical quick replies back in. Valid values are 'yes' or 'no'. - -## special\_words - -Path to a file containing words we always want to keep as full quick replies.
- -## count\_threshold - -When `qr_treatment` is 'reformat', set limits on the number of quick replies that are processed. - -If the number of quick replies is below or equal to count\_threshold then the quick replies are left in place. - -## length\_threshold - -When `qr_treatment` is 'reformat', set limits on the number of quick replies that are processed. - -If the character-length of the longest quick reply is below or equal to length\_threshold then the quick replies are left in place. - -## ab\_testing\_sheet\_id - -Google Sheets ID for Sheet containing AB testing data. - -## localisation\_sheet\_id - -Google Sheets ID. - -## eng\_edits\_sheet\_id - -Google Sheets ID for Sheet containing dict edits data. - -## transl\_edits\_sheet\_id - -Google Sheets ID. - -## sg\_flow\_id - -Sheets ID for Sheet containing safeguarding data. - -## sg\_flow\_name - -The name of the RapidPro flow for safeguarding. - -## sg\_path - -Path to file containing translated safeguarding words in JSON format. - -## sg\_sources - -Defines a list of sources containing safeguarding keywords. Each entry is a `dict` containing the following keys: - -- `key`: three letter language code of the translated words -- `path`: file path on the local file system to the XLSX file containing the words - -For example: -```python -{ - "sg_sources": [ - { - "key": "spa", - "path": "excel_files/safeguarding mexico.xlsx", - }, - ], -} -``` - -The referenced XLSX files will be converted to a single file called _safeguarding\_words.json_, in the output directory. The `sg_path` setting will be overridden to point to this JSON file, for further processing. If `sg_sources` is not set, `sg_path` will remain unchanged. - -## redirect\_flow\_names - -Names of redirect flows to be modified as part of safeguarding process. +[sources]: sources.md +[steps]: steps.md diff --git a/docs/operations.md b/docs/operations.md new file mode 100644 index 0000000..ee8fa48 --- /dev/null +++ b/docs/operations.md @@ -0,0 +1,30 @@ +# Overview + +The pipeline tool supports different operations. To run the pipeline performing a sequence of operations: + +``` +python -m parenttext_pipeline.cli operation1 operation2 ... ``` + +In order to run a pipeline, you must have a configuration file; see the [configuration page][config] for more details. + +Two operations are currently available: + +## `pull_data` + +Read data from various sources (which are defined in the config) and store them locally in json format. +The data will be written to the input folder specified in the config. +Different input formats are supported, and the data for each source is written to its own subfolder, see [sources]. + +The purpose of this is to ensure that `compile_flows` runs of the pipeline are reproducible, by essentially freezing the state of all input spreadsheets at a point in time. It attempts to avoid the potential problem of Google Sheets being updated incorrectly and causing a pipeline run to fail. The `compile_flows` pipeline will only read locally stored data that has been pulled beforehand. + + +## `compile_flows` + +Compile RapidPro flows from locally stored json files that have been pulled using `pull_data`. +Compiling flows involves multiple processing steps that are defined in the config, see [steps].
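For reference, the same sequence of operations can also be run from Python. The following is a sketch equivalent to the CLI invocation above, assuming a `config.json` or `config.py` is present in the current working directory:

```python
from parenttext_pipeline import compile_flows, pull_data
from parenttext_pipeline.configs import load_config

# Equivalent of `python -m parenttext_pipeline.cli pull_data compile_flows`
config = load_config()      # reads config.json (or config.py) from the CWD
pull_data.run(config)       # freeze all source data locally as json
compile_flows.run(config)   # run the configured steps over the pulled data
```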
+ + +[config]: configuration.md +[steps]: steps.md +[sources]: sources.md \ No newline at end of file diff --git a/docs/sources.md b/docs/sources.md new file mode 100644 index 0000000..207b607 --- /dev/null +++ b/docs/sources.md @@ -0,0 +1,38 @@ +# Sources + +Sources represent references to input data that may be used by [steps] of the pipeline, in various possible *source formats*. + +- `sheets`: Model-agnostic spreadsheet workbooks (a *spreadsheet* or *workbook* is a collection of individual *sheets*). + - These may be in any of the following *subformats*: + - `google_sheets`: Reference to a Google spreadsheet + - `xlsx`: Reference to an XLSX file + - `csv`: Reference to a folder of csv files representing the workbook. + - `json`: Reference to a workbook in JSON format. + - Each input file is converted into JSON workbook format; the resulting files are stored flat in the output folder. In case of a name clash, a later file will overwrite an earlier file. (Processing order is `files_list` > `files_dict`) +- `json`: JSON files. + - These are taken as is and copied to their new storage location. + - Currently, only local file paths are supported. +- `translation_repo`: a format specifically for the translation step, see `TranslationSourceConfig` in [configs]. +- `safeguarding`: a format specifically for the safeguarding step (to be deprecated), see `SafeguardingSourceConfig` in [configs]. +- Remark: In the future, we may introduce a model-specific spreadsheet format with a master sheet indicating the model underlying each sheet, so that the data can be validated and stored in a json format representing the (possibly nested) model. + +Such data can be *pulled* to convert it into a GitHub-friendly *storage format* (i.e. plaintext json) and store it locally. Once stored locally, such data can be used as input to individual steps of the *flow compilation* pipeline. The storage format is (so far) always json, and the exact structure of the json is domain-specific, i.e. the user has to make sure that the data presented is in a format suitable for a specific pipeline step. In particular, it may be possible to represent input data in different *source formats* that yield the same data in the *storage format*. + +## File referencing + +The source config fully determines the storage location of the data in its *storage format*. All data is stored inside `{config.inputpath}`. For each source, a subfolder `{source.id}` is created. + +### `json` and `sheets` + +Within the source's subfolder, for each `(name, filepath)` entry in `{source.files_dict}`, the processed version of `{filepath}` is stored as `{name}.json`. + +### `sheets` only + +For the input format `sheets`, we can additionally use `files_list`. + +- A special case is when `files_archive` is provided and `source.subformat` is `csv`: then for each `sheet_id` entry in `source.files_list`, we process the folder `sheet_id` as a csv workbook and store the converted result as `{sheet_id}.json`. + - Otherwise, for each `sheet_id` entry in `source.files_list`, the processed version of `sheet_id` is stored as `{sheet_id}.json`. Note that this currently only works if `source.subformat` is `google_sheets`, because we have not made a decision on how to turn full file paths into filenames. +- Remark: Do we still need `files_archive` (`.zip` archive) support? I'd be keen to deprecate it.
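To make the file referencing rules concrete, here is a sketch of a `sources` section mixing a `sheets` source and a `json` source (the sheet ID and file paths are placeholders):

```python
{
    "sources": {
        "flow_definitions": {
            "format": "sheets",
            "subformat": "google_sheets",
            # pulled and stored as flow_definitions/<google-sheet-id>.json
            "files_list": ["<google-sheet-id>"],
        },
        "qr_treatment": {
            "format": "json",
            "files_dict": {
                # stored as qr_treatment/select_phrases_file.json
                "select_phrases_file": "resources/select_phrases.json",
                # stored as qr_treatment/special_words_file.json
                "special_words_file": "resources/special_words.json",
            },
        },
    }
}
```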
+ +[configs]: ../src/parenttext_pipeline/configs.py +[steps]: steps.md \ No newline at end of file diff --git a/docs/steps.md b/docs/steps.md new file mode 100644 index 0000000..d2b321c --- /dev/null +++ b/docs/steps.md @@ -0,0 +1,54 @@ +# Pipeline steps + +Each step has an identifier (name), a type, and (depending on the type) may have a list of [sources] referencing input files relevant for the step. +Depending on the type, the config of each step may have various additional fields, see [configs] for details. The identifier (name) has no further purpose; it only serves for reporting and affects temp file names. Input file locations are determined by the sources. + +We have the following types of steps: + +- `create_flows`: Create flows from sheets (using `rpft create_flows`) + - source(s): type `sheets`, the input sheets to create the flows from + - `models_module`: Name of the Python module containing data models to use as part of the process of converting data extracted from sheets. + - `tags`: Tags to pass to `rpft create_flows`. Used to identify flows to be processed. Possible values for tag 1: + - `onboarding` + - `dev_assess` + - `ltp_activity` + - `home_activity_checkin` + - `module` + - `goal_checkin` + - `safeguarding` + - `menu` + - `delivery` +- `load_flows`: Load flows directly from json. + - source(s): type `json`, the source must reference exactly one input RapidPro json file (that the following steps operate on) +- `edits`: Apply edits and/or A/B-Testing to input flows (using repo `rapidpro_abtesting`) + - source(s): type `sheets`, the sheets defining the edits to do on the flows +- `extract_texts_for_translators`: Extract text from flows and produce a `.pot` file for translation. + - `crowdin_name`: base name of the output files +- `fix_arg_qr_translation`: ??? +- `has_any_word_check`: ??? +- `overall_integrity_check`: ??? +- `qr_treatment`: ... + - source: type `json`, the source's `files_dict` must have entries `select_phrases_file` and `special_words_file` + - see `QRTreatmentStepConfig` in [configs] +- `safeguarding`: ... + - source(s): type `safeguarding`, files to read safeguarding data from + - see `SafeguardingStepConfig` in [configs] +- `translation`: Generate translated flows + - source(s): type `translation_repo`, repo to read translated strings from + - `languages`: List of languages to translate the flows into. Each language is a dict with two keys: + - `language` is the 3-letter code used in RapidPro + - `code` is the 2-letter code used in CrowdIn +- `update_expiration_times`: Update expiration times of flows (using a default value and an optional file defining flow-specific values) + - source (optional): type `json`, the source's `files_dict` must have an entry `special_expiration_file` defining a map from flow names to expiration times + - `default_expiration_time`: expiration time to apply to all flows that are not referenced in `special_expiration_file` + +The first step of the pipeline must be `create_flows` or `load_flows`. These two steps do not take any input flows, and thus only make sense as the first step. + +### Remarks + +We want to have the functionality to pull Goals API data from a spreadsheet and store it locally, so it can be read by the API directly from GitHub. +This does not require a step, but can be implemented by only specifying a `goals_api` source which is not referenced by any step.
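As an illustration, here is a sketch of a `steps` list combining some of the step types above (all values are placeholders; the source names must match entries in the config's `sources` dictionary):

```python
{
    "steps": [
        {
            "id": "create_flows",
            "type": "create_flows",
            "sources": ["flow_definitions"],
            "models_module": "models.parenttext_models",  # placeholder
            "tags": ["onboarding"],
        },
        {
            "id": "edits_pretranslation",
            "type": "edits",
            "sources": ["edits_pretranslation"],
        },
        {
            "id": "update_expiration_times",
            "type": "update_expiration_times",
            "sources": ["expiration_times"],
            "default_expiration_time": 1440,  # placeholder value
        },
    ]
}
```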
+ +[configs]: ../src/parenttext_pipeline/configs.py +[sources]: sources.md \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index f6cc773..75a420f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,15 +1,15 @@ [build-system] -requires = ["setuptools"] +requires = ["setuptools", "setuptools-scm"] build-backend = "setuptools.build_meta" [project] name = "parenttext_pipeline" -version = "0.2.2" +dynamic = ["version"] authors = [ {name = "IDEMS International", email = "communications@idems.international"}, ] readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.10" license = {text = "GPL-3.0-or-later"} classifiers = [ "Development Status :: 4 - Beta", @@ -20,8 +20,6 @@ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Topic :: Text Processing :: General", @@ -29,10 +27,14 @@ classifiers = [ ] dependencies = [ "beautifulsoup4~=4.12", - "rapidpro-abtesting@https://github.com/IDEMSInternational/rapidpro_abtesting/archive/refs/tags/0.1.1.tar.gz", + "packaging~=21.3", + "rapidpro-abtesting@https://github.com/IDEMSInternational/rapidpro_abtesting/archive/refs/tags/0.1.2.tar.gz", "requests~=2.31", - "rpft@https://github.com/IDEMSInternational/rapidpro-flow-toolkit/archive/refs/tags/1.1.3.tar.gz", + "rpft@https://github.com/IDEMSInternational/rapidpro-flow-toolkit/archive/refs/tags/1.2.1.tar.gz", ] [project.scripts] rpimport = "parenttext_pipeline.importer:cli" + +[tool.setuptools_scm] +version_file = "src/parenttext_pipeline/_version.py" diff --git a/src/parenttext_pipeline/__init__.py b/src/parenttext_pipeline/__init__.py index e69de29..0d77031 100644 --- a/src/parenttext_pipeline/__init__.py +++ b/src/parenttext_pipeline/__init__.py @@ -0,0 +1,7 @@ +def pipeline_version(): + try: + from parenttext_pipeline._version import version + except ModuleNotFoundError: + version = "dev" + + return version diff --git a/src/parenttext_pipeline/cli.py b/src/parenttext_pipeline/cli.py index 30dc158..25dca80 100644 --- a/src/parenttext_pipeline/cli.py +++ b/src/parenttext_pipeline/cli.py @@ -1,24 +1,48 @@ -import runpy +import argparse -from parenttext_pipeline.pipelines import Config, run +from packaging.version import Version +import parenttext_pipeline.compile_flows +import parenttext_pipeline.pull_data +from parenttext_pipeline import pipeline_version +from parenttext_pipeline.configs import load_config -class ConfigError(Exception): - pass +OPERATIONS_MAP = { + "pull_data": parenttext_pipeline.pull_data.run, + "compile_flows": parenttext_pipeline.compile_flows.run, +} def init(): - run(load_config()) - - -def load_config(): - create_config = runpy.run_path('config.py').get("create_config") - - if create_config and callable(create_config): - return Config(**create_config()) - else: - raise ConfigError("Could not find 'create_config' function in 'config.py'") - - -if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Run a pipeline of operations.") + parser.add_argument( + "operations", + nargs="+", + help=( + "Sequence of operations to perform. " + "Valid choices: pull_data, compile_flows." 
+ ), + ) + args = parser.parse_args() + + config = load_config() + + config_pipeline_version = Version(config.meta["pipeline_version"]) + real_pipeline_version = Version(pipeline_version()) + if config_pipeline_version > real_pipeline_version: + raise ValueError( + f"Pipeline version of the config {config_pipeline_version} is newer " + f"than actual pipeline version {real_pipeline_version}" + ) + if config_pipeline_version.major != real_pipeline_version.major: + raise ValueError( + f"Major of config pipeline version {config_pipeline_version} does not " + f"match major of actual pipeline version {real_pipeline_version}" + ) + + for operation in args.operations: + OPERATIONS_MAP[operation](config) + + +if __name__ == "__main__": init() diff --git a/src/parenttext_pipeline/common.py b/src/parenttext_pipeline/common.py new file mode 100644 index 0000000..5d10111 --- /dev/null +++ b/src/parenttext_pipeline/common.py @@ -0,0 +1,121 @@ +import itertools +import json +import os +import shutil +import subprocess +from pathlib import Path + +from parenttext_pipeline import pipeline_version + + +def clear_or_create_folder(path): + if os.path.exists(path): + shutil.rmtree(path) + os.makedirs(path) + + +def get_input_folder(config, makedirs=False, in_temp=True): + if in_temp: + input_path = Path(config.temppath) / "input" + else: + input_path = Path(config.inputpath) + if makedirs: + os.makedirs(input_path, exist_ok=True) + return input_path + + +def get_input_subfolder(config, name, makedirs=False, in_temp=True): + source_input_path = get_input_folder(config, makedirs, in_temp) / name + if makedirs: + os.makedirs(source_input_path, exist_ok=True) + return source_input_path + + +def get_sheet_id(config, sheet_name): + return config.sheet_names.get(sheet_name, sheet_name) + + +def input_files_from_ids(step_input_path, spreadsheet_ids): + sheets = [ + os.path.join(step_input_path, f"{sheet_id}.json") + for sheet_id in spreadsheet_ids + ] + return sheets + + +def get_source_config(config, source_name, step_name): + source_config = config.sources.get(source_name) + if source_config is None: + raise ValueError(f"Step {step_name} references undefined source {source_name}") + return source_config + + +def get_files_from_source(config, source_name, step_name): + files_by_id = [] + source_config = get_source_config(config, source_name, step_name) + if source_config.format not in ["sheets", "json"]: + raise ValueError( + f"Source {source_name} referenced by step {step_name} should be " + "of format sheets or json but is not." 
+ ) + step_input_path = get_input_subfolder(config, source_name) + # JSON input format currently doesn't support files_list + for file_id in itertools.chain( + getattr(source_config, "files_list", []), source_config.files_dict.keys() + ): + files_by_id.append((file_id, os.path.join(step_input_path, f"{file_id}.json"))) + return files_by_id + + +def get_files_list_from_source(config, source_name, step_name): + files_by_id = get_files_from_source(config, source_name, step_name) + return [pair[1] for pair in files_by_id] + + +def get_files_dict_from_source(config, source_name, step_name): + files_by_id = get_files_from_source(config, source_name, step_name) + return dict(files_by_id) + + +def get_full_step_files_list(config, step_config): + files = [] + if not step_config.sources: + raise ValueError(f"{step_config.id} step does not have any sources") + for source in step_config.sources: + files += get_files_list_from_source(config, source, step_config.id) + return files + + +def get_full_step_files_dict(config, step_config): + files = {} + if not step_config.sources: + raise ValueError(f"{step_config.id} step does not have any sources") + for source in step_config.sources: + files |= get_files_dict_from_source(config, source, step_config.id) + return files + + +def make_output_filepath(config, suffix): + return os.path.join( + config.temppath, + config.flows_outputbasename + suffix, + ) + + +def write_meta(config, field_dict, path): + meta = { + "pipeline_version": pipeline_version(), + "config_version": config.meta.get("version") or "legacy", + } | field_dict + + with open(Path(path) / "meta.json", "w") as outfile: + json.dump(meta, outfile, indent=2) + + +def read_meta(path): + with open(Path(path) / "meta.json") as infile: + return json.load(infile) + + +def run_node(script, *args): + subprocess.run(["node", "node_modules/@idems/" + script, *args]) diff --git a/src/parenttext_pipeline/compile_flows.py b/src/parenttext_pipeline/compile_flows.py new file mode 100644 index 0000000..5976a7d --- /dev/null +++ b/src/parenttext_pipeline/compile_flows.py @@ -0,0 +1,55 @@ +from parenttext_pipeline import steps +from parenttext_pipeline.common import ( + clear_or_create_folder, + get_input_folder, + read_meta, + write_meta, +) +from parenttext_pipeline.compile_sources import compile_sources + + +def run(config): + clear_or_create_folder(config.outputpath) + clear_or_create_folder(config.temppath) + + print("Compiling sources...") + config.sources = compile_sources(".", get_input_folder(config)) + + data = read_meta(config.inputpath) + meta = {"pull_timestamp": data["pull_timestamp"]} + write_meta(config, meta, config.outputpath) + + input_file = None + for step_num, step_config in enumerate(config.steps): + output_file = apply_step(config, step_config, step_num + 1, input_file) + print(f"Applied step {step_config.type}, result stored at {output_file}") + input_file = output_file + + steps.split_rapidpro_json(config, output_file) + print("Result written to output folder") + steps.write_diffable(config, output_file) + print("Diffable written to output folder") + + +STEP_MAPPING = { + "create_flows": steps.create_flows, + "load_flows": steps.load_flows, + "edits": steps.apply_edits, + "extract_texts_for_translators": steps.apply_extract_texts_for_translators, + "fix_arg_qr_translation": steps.apply_fix_arg_qr_translation, + "has_any_word_check": steps.apply_has_any_word_check, + "overall_integrity_check": steps.apply_overall_integrity_check, + "qr_treatment": steps.apply_qr_treatment, + "safeguarding": 
steps.apply_safeguarding, + "translation": steps.apply_translations, + "update_expiration_times": steps.update_expiration_times, +} + + +def apply_step(config, step_config, step_number, step_input_file): + step_type = step_config.type + function = STEP_MAPPING[step_type] + step_output_file = function(config, step_config, step_number, step_input_file) + if step_output_file is not None: + return step_output_file + return step_input_file diff --git a/src/parenttext_pipeline/compile_sources.py b/src/parenttext_pipeline/compile_sources.py new file mode 100644 index 0000000..a7bbcf7 --- /dev/null +++ b/src/parenttext_pipeline/compile_sources.py @@ -0,0 +1,77 @@ +import os +from pathlib import Path +import shutil +import tempfile + +from parenttext_pipeline.pull_data import unpack_archive +from parenttext_pipeline.configs import load_config + + +def compile_sources(repo_folder, destination_folder): + """ + Compile flattened sources such that parent content is included directly. + + For each source, a folder is created within destination_folder with source data, + and a source config is produced containing both the files from the parents + and the child's own files. + + Args: + repo_folder: local path to folder containing config + destination_folder: local path where compiled input files should be written + Returns: + The dict of sources from the config in the repo_folder, + each source flattened so that parent content is included directly. + """ + + destination_folder = Path(destination_folder) + repo_folder = Path(repo_folder) + os.makedirs(destination_folder, exist_ok=True) + config = load_config(repo_folder) + parent_source_configs = {} + for parent_id, parent in config.parents.items(): + parent_destination_folder = destination_folder / parent_id + with tempfile.TemporaryDirectory() as temp_dir: + unpack_archive(temp_dir, parent.location) + # After extracting, the archive content is inside a single subfolder + # which we need to identify. + folder_contents = os.listdir(temp_dir) + assert len(folder_contents) == 1 + archive_content_folder = Path(temp_dir) / folder_contents[0] + parent_source_configs[parent_id] = compile_sources( + archive_content_folder, parent_destination_folder + ) + for source_id, source in config.sources.items(): + files_list = [] + files_dict = {} + for parent_source in source.parent_sources: + split = parent_source.split(".") + assert len(split) == 2 + parent_id, psource_id = split + psource = parent_source_configs[parent_id][psource_id] + # Merge in parent file lists/dicts and copy referenced input files + shutil.copytree( + destination_folder / parent_id / psource_id, + destination_folder / source_id, + dirs_exist_ok=True, + ) + for file in psource.files_list: + files_list.append(file) + for fileid, file in psource.files_dict.items(): + files_dict[fileid] = file + source.files_list = files_list + source.files_list + source.files_dict = files_dict | source.files_dict + source.parent_sources = []  # parents are merged in; clear the references + shutil.copytree( + Path(repo_folder) / config.inputpath, destination_folder, dirs_exist_ok=True + ) + return config.sources + + +# def compile_input(repo_folder, destination_folder): +# - read config in repo_folder, checkout each parent into a temp folder +# - recursive call on each parent +# --> compiles each parent into a destination folder (within the temp?)
+# --> each (parent) source -- when compiled -- corresponds to a folder of data +# - for each source: +# - copy compiled parent content into the destination child folder for each parent +# - copy the child content on top (from input/) diff --git a/src/parenttext_pipeline/config_converter.py b/src/parenttext_pipeline/config_converter.py new file mode 100644 index 0000000..8a4c702 --- /dev/null +++ b/src/parenttext_pipeline/config_converter.py @@ -0,0 +1,139 @@ +from parenttext_pipeline import pipeline_version + + +def convert_config(config): + return { + # I am assuming that the list of sources contains only one entry, + # as that is what I have observed in practice. + # In the original config, config["sources"][0]["crowdin_name"] + # specifies the output filename of the .pot file that is produced to be + # uploaded to crowdin. This is ignored as the filename is hardcoded now. + "meta": { + "pipeline_version": pipeline_version(), + }, + "parents": {}, + "flows_outputbasename": config["sources"][0].get("filename"), + "output_split_number": config["sources"][0].get("split_no"), + "sources": { + "flow_definitions": { + "format": "sheets", + "subformat": "google_sheets", + "files_list": config["sources"][0].get("spreadsheet_ids"), + "files_archive": config["sources"][0].get("archive"), + }, + "edits_pretranslation": { + "format": "sheets", + "subformat": "google_sheets", + "files_list": [ + config.get(sheet_name) + for sheet_name in ["ab_testing_sheet_id", "localisation_sheet_id"] + if config.get(sheet_name) + ], + }, + "edits_posttranslation": { + "format": "sheets", + "subformat": "google_sheets", + "files_list": [ + config.get(sheet_name) + for sheet_name in ["transl_edits_sheet_id", "eng_edits_sheet_id"] + if config.get(sheet_name) + ], + }, + "translation": { + "format": "translation_repo", + "translation_repo": config.get("translation_repo"), + "folder_within_repo": config.get("folder_within_repo"), + "languages": config.get("languages"), + }, + "expiration_times": { + "format": "json", + "files_dict": { + "special_expiration_file": config.get("special_expiration"), + }, + }, + "qr_treatment": { + "format": "json", + "files_dict": { + "select_phrases_file": config.get("select_phrases"), + "special_words_file": config.get("special_words"), + }, + }, + "safeguarding": { + "format": "safeguarding", + "filepath": config.get("sg_path"), + "sources": config.get("sg_sources"), + }, + }, + "steps": [ + { + "id": "create_flows", + "type": "create_flows", + "sources": ["flow_definitions"], + "models_module": config.get("model"), + "tags": config["sources"][0].get("tags"), + }, + { + "id": "update_expiration_times", + "type": "update_expiration_times", + "sources": ["expiration_times"], + "default_expiration_time": config.get("default_expiration"), + }, + { + "id": "edits_pretranslation", + "type": "edits", + "sources": ["edits_pretranslation"], + }, + { + "id": "hasanyword_pretranslation", + "type": "has_any_word_check", + }, + { + "id": "overall_integrity_check_pretranslation", + "type": "overall_integrity_check", + }, + { + "id": "extract_texts_for_translators", + "type": "extract_texts_for_translators", + }, + { + "id": "translation", + "type": "translation", + "sources": ["translation"], + "languages": config.get("languages"), + }, + { + "id": "edits_posttranslation", + "type": "edits", + "sources": ["edits_posttranslation"], + }, + { + "id": "hasanyword_posttranslation", + "type": "has_any_word_check", + }, + { + "id": "fix_arg_qr_translation", + "type": "fix_arg_qr_translation", + }, + { + "id":
"overall_integrity_check_posttranslation", + "type": "overall_integrity_check", + }, + { + "id": "qr_treatment", + "type": "qr_treatment", + "sources": ["qr_treatment"], + "qr_treatment": config.get("qr_treatment"), + "count_threshold": config.get("count_threshold"), + "length_threshold": config.get("length_threshold"), + "add_selectors": config.get("add_selectors"), + }, + { + "id": "safeguarding", + "type": "safeguarding", + "sources": ["safeguarding"], + "flow_uuid": config.get("sg_flow_id"), + "flow_name": config.get("sg_flow_name"), + "redirect_flow_names": config.get("redirect_flow_names"), + }, + ], + } diff --git a/src/parenttext_pipeline/configs.py b/src/parenttext_pipeline/configs.py new file mode 100644 index 0000000..a764f3e --- /dev/null +++ b/src/parenttext_pipeline/configs.py @@ -0,0 +1,278 @@ +from dataclasses import dataclass, field +import json +from pathlib import Path +import contextlib +import os +import runpy + +from parenttext_pipeline.config_converter import convert_config + + +@dataclass(kw_only=True) +class StepConfig: + # Identifier (name) of the step + id: str + # Type of the step, should be one of STEP_CONFIGS.keys() + # Make this an enum maybe + type: str + # A list of input data sources used by this step + sources: list = field(default_factory=list) + + +@dataclass(kw_only=True) +class CreateFlowsStepConfig(StepConfig): + # Name of the Python module containing data models describing the data sheets + models_module: str = None + # Tags for RPFT create_flows operation + tags: list + + +@dataclass(kw_only=True) +class SafeguardingStepConfig(StepConfig): + # Either (flow_id and flow_name) or redirect_flow_names has to be provided + + # The UUID of the RapidPro flow for safeguarding. + flow_uuid: str = None + # The name of the RapidPro flow for safeguarding. + flow_name: str = None + # A string representing a list of flow names o_O + # Names of redirect flows to be modified as part of safeguarding process. + redirect_flow_names: str + + +@dataclass(kw_only=True) +class UpdateExpirationStepConfig(StepConfig): + # Default flow expiration time + default_expiration_time: int + # sources: may reference a JSON-type source defining a file_dict containing + # the following key: `special_expiration_file`. + # This source file maps flow names to expiration times + + +@dataclass(kw_only=True) +class QRTreatmentStepConfig(StepConfig): + # str: how to process quick replies + # move: Remove quick replies and add equivalents to them to the message text, + # and give numerical prompts to allow basic phone users to use the app. + # move_and_mod: As above but has additional functionality allowing you + # to replace phrases + # reformat: Reformat quick replies so that long ones are added to the message text, + # as above. + # reformat_china: Reformat quick replies to the standard as requested by China + # wechat: All quick replies moved to links in message text as can be used in WeChat + qr_treatment: str + # ??? + qr_limit: int = 10 + # When qr_treatment is 'reformat', + # set limits on the number of quick replies that are processed. + # If the number of quick replies is below or equal to count_threshold + # then the quick replies are left in place. + count_threshold: str = None + # When qr_treatment is 'reformat', set limits on the number of quick replies + # that are processed. If the character-length of the longest quick reply is + # below or equal to length_threshold then the quick replies are left in place. 
+ length_threshold: str = None + # If qr_treatment is 'move', add some basic numerical quick replies back in. + # Valid values are 'yes' or 'no'. + add_selectors: str = None + # Path to file with the default phrase (including translations) we want to add + # if quick replies are being moved to message text. + replace_phrases: str = "" + # sources: must reference a JSON-type source defining a files_dict containing the + # following keys: + # `select_phrases_file` and `special_words_file`. + # `select_phrases_file`: file with the default phrase (including translations) + # we want to add if quick replies are being moved to message text. + # `special_words_file`: file containing words (including translations) + # we always want to keep as full quick replies. + + +@dataclass(kw_only=True) +class TranslationStepConfig(StepConfig): + # Languages that will be looked for to localize back into the flows + # Should be a subset of the languages specified in the source. + # Each entry is a dict with two keys: + # "language" is the 3-letter code used in RapidPro + # "code" is the 2 letter code used in CrowdIn + languages: list[dict] + + +STEP_CONFIGS = { + "create_flows": CreateFlowsStepConfig, + "edits": StepConfig, + "translation": TranslationStepConfig, + "safeguarding": SafeguardingStepConfig, + "update_expiration_times": UpdateExpirationStepConfig, + "qr_treatment": QRTreatmentStepConfig, + "load_flows": StepConfig, + "extract_texts_for_translators": StepConfig, + "fix_arg_qr_translation": StepConfig, + "has_any_word_check": StepConfig, + "overall_integrity_check": StepConfig, +} + + +@dataclass(kw_only=True) +class ParentReference: + # URL of the repo/zip of the parent + location: str + + +@dataclass(kw_only=True) +class SourceConfig: + # Format of the source data + format: str + # References to parent sources to include in this source + parent_sources: list[str] = field(default_factory=list) + # For each `(name, filepath)` entry in `{files_dict}`, the processed version + # of `{filepath}` is stored as `{name}.json`. + files_dict: dict[str, str] = field(default_factory=dict) + # List of file references to process; interpretation depends on the subformat + # (see subclasses) + files_list: list[str] = field(default_factory=list) + + +@dataclass(kw_only=True) +class SheetsSourceConfig(SourceConfig): + # Input format of the sheets. + # Either google_sheets, csv, json or xlsx + subformat: str + + # If files_archive is None: List of Google Sheet IDs to read from + # If files_archive is not None: List of folder names within archive + files_list: list[str] = field(default_factory=list) + # Path or URL to a zip archive containing folders + # each with sheets in CSV format (no nesting) + files_archive: str = None + + +@dataclass(kw_only=True) +class JSONSourceConfig(SourceConfig): + # For each `(name, filepath)` entry in `{files_dict}`, the processed version + # of `{filepath}` is stored as `{name}.json`.
+ # Redefined to make this required + files_dict: dict[str, str] + + +@dataclass(kw_only=True) +class SafeguardingSourceConfig(SourceConfig): + # Either filepath or sources has to be provided + + # Path to json file with safeguarding words + filepath: str = None + # List of XLSX files with safeguarding words + # Each source is a dict with two entries: + # key: 3-letter language code + # path: path to an XLSX file containing safeguarding words + sources: list[dict[str, str]] = None + + def __post_init__(self): + if self.filepath is None and self.sources is None: + raise ValueError( + "For SafeguardingSourceConfig, either filepath " + "or sources needs to be provided" + ) + + +@dataclass(kw_only=True) +class TranslationSourceConfig(SourceConfig): + # Languages for which to pull the translation data. + # Each entry is a dict with two keys: + # "language" is the 3-letter code used in RapidPro + # "code" is the 2 letter code used in CrowdIn + languages: list[dict] + # Git repository (synced with crowdin) to read translation PO files from + translation_repo: str + # Folder within the `translation_repo` repository to read + # translation PO files from + folder_within_repo: str + # Not Implemented: Commit hash or tag in the repo + # TODO: Offer branch, and then store the commit hash as part of + # the meta info about the output + commit_hash: str = None + commit_tag: str = None + + +SOURCE_CONFIGS = { + "sheets": SheetsSourceConfig, + "json": JSONSourceConfig, + "translation_repo": TranslationSourceConfig, + "safeguarding": SafeguardingSourceConfig, +} + + +@dataclass(kw_only=True) +class Config: + meta: dict + parents: dict[str, ParentReference] = field(default_factory=dict) + sheet_names: dict = field(default_factory=dict) + sources: dict[str, SourceConfig] + steps: list[StepConfig] = field(default_factory=list) + temppath: str = "temp" + outputpath: str = "output" + inputpath: str = "input" + flows_outputbasename: str + # Number of files to split the output into + output_split_number: int = 1 + + def __post_init__(self): + steps = [] + for step_config in self.steps: + step_type = step_config["type"] + step_config_class = STEP_CONFIGS.get(step_type) + if step_config_class is None: + raise ValueError(f"Unknown step type: {step_type}") + steps.append(step_config_class(**step_config)) + self.steps = steps + + sources = {} + for source_name, source_config in self.sources.items(): + source_format = source_config["format"] + source_config_class = SOURCE_CONFIGS.get(source_format) + if source_config_class is None: + raise ValueError(f"Unknown source type: {source_format}") + sources[source_name] = source_config_class(**source_config) + self.sources = sources + + parents = {} + for parent_name, parent_config in self.parents.items(): + parents[parent_name] = ParentReference(**parent_config) + self.parents = parents + + +class ConfigError(Exception): + pass + + +@contextlib.contextmanager +def change_cwd(new_cwd): + cwd = os.getcwd() + os.chdir(new_cwd) + + try: + yield + finally: + os.chdir(cwd) + + +def load_config(path="."): + try: + with open(Path(path) / "config.json") as f: + config = json.load(f) + return Config(**config) + except FileNotFoundError: + pass + + try: + with change_cwd(path): + create_config = runpy.run_path("config.py").get("create_config") + except FileNotFoundError: + raise ConfigError("Could not find 'config.json' or 'config.py'") + + if create_config and callable(create_config): + config = create_config() + if "meta" not in config: + # Legacy version of config detected.
Converting to new config format. + config = convert_config(config) + return Config(**config) + else: + raise ConfigError("Could not find 'create_config' function in 'config.py'") diff --git a/src/parenttext_pipeline/importer.py b/src/parenttext_pipeline/importer.py index 697d39f..b6bbe4a 100644 --- a/src/parenttext_pipeline/importer.py +++ b/src/parenttext_pipeline/importer.py @@ -75,10 +75,7 @@ def logout(session, host): def extract_csrf_token(session, url): - return BeautifulSoup( - session.get(url).text, - features="html.parser", - ).select_one( + return BeautifulSoup(session.get(url).text, features="html.parser",).select_one( "input[name=csrfmiddlewaretoken]" )["value"] diff --git a/src/parenttext_pipeline/pipelines.py b/src/parenttext_pipeline/pipelines.py deleted file mode 100644 index 879f603..0000000 --- a/src/parenttext_pipeline/pipelines.py +++ /dev/null @@ -1,550 +0,0 @@ -import json -import os -import requests -import shutil -import subprocess -import tempfile -from dataclasses import dataclass -from pathlib import Path - -from rpft.converters import create_flows -from rpft.logger.logger import initialize_main_logger -from rapidpro_abtesting.main import apply_abtests - -from parenttext_pipeline.steps import update_expiration_time, split_rapidpro_json -from parenttext_pipeline.extract_keywords import process_keywords_to_file - - -@dataclass(kw_only=True) -class Config: - sources: list - special_expiration: str - default_expiration: int - model: str - languages: list - translation_repo: str - folder_within_repo: str - outputpath: str = "output" - qr_treatment: str - select_phrases: str - replace_phrases: str = "" - add_selectors: str - special_words: str - count_threshold: int - length_threshold: int - qr_limit: int = 10 - ab_testing_sheet_id: str = "" - localisation_sheet_id: str = "" - eng_edits_sheet_id: str = "" - transl_edits_sheet_id: str = "" - sg_flow_id: str = "" - sg_flow_name: str = "" - sg_path: str = "" - sg_sources: list = None - redirect_flow_names: str = "" - - -def run(config: Config): - outputpath = config.outputpath - - if not os.path.exists(outputpath): - os.makedirs(outputpath) - - ##################################################################### - # Step 0: Fetch available PO files and convert to JSON - ##################################################################### - - for lang in config.languages: - lang_name = lang["language"] - lang_code = lang["code"] - translations_store_folder = Path(outputpath) / f"{lang_name}_translations" - - if os.path.exists(translations_store_folder): - shutil.rmtree(translations_store_folder) - - os.makedirs(translations_store_folder) - - # Download relevant translation files from github - language_folder_in_repo = config.folder_within_repo + "/" + lang_code - raw_translation_store = os.path.join(translations_store_folder, "raw_po_files") - download_translations_github( - config.translation_repo, - language_folder_in_repo, - raw_translation_store, - ) - - for root, dirs, files in os.walk(raw_translation_store): - for file in files: - file_name = Path(file).stem - source_file_path = os.path.join(root, file) - dest_file_path = os.path.join( - translations_store_folder, - file_name + ".json", - ) - run_node( - "idems_translation_common/index.js", - "convert", - source_file_path, - dest_file_path, - ) - - # Merge all translation files into a single JSON that we can localise back into - # our flows - run_node( - "idems_translation_common/index.js", - "concatenate_json", - translations_store_folder, - 
translations_store_folder, - "merged_translations.json", - ) - - print("Step 0 complete, fetched all available translations and converted to json") - - initialize_main_logger(Path(outputpath) / "rpft.log") - - for source in config.sources: - source_file_name = source["filename"] - crowdin_file_name = source["crowdin_name"] - - ##################################################################### - # Step 1: Load google sheets and convert to RapidPro JSON - ##################################################################### - - archive_fp = download_archive(config, source) - input_path_1_2 = load_sheets(config, source, archive_fp) - input_path_2 = update_expiration_time(config, source, input_path_1_2) - - print("Step 1 complete") - - ##################################################################### - # Step 2: Flow edits (for all deployments) and localization (changes specific to - # a deployment) - ##################################################################### - - log_file_path = os.path.join(outputpath, "2_ab_testing.log") - ab_testing_sheet_id = config.ab_testing_sheet_id - localisation_sheet_id = config.localisation_sheet_id - - if ab_testing_sheet_id or localisation_sheet_id: - output_file_name_2 = source_file_name + "_2_flow_edits" - output_path_2 = os.path.join(outputpath, output_file_name_2 + ".json") - - input_sheets = [ - sheet - for sheet in [config.ab_testing_sheet_id, config.localisation_sheet_id] - if sheet - ] - - apply_abtests( - input_path_2, - output_path_2, - input_sheets, - "google_sheets", - log_file_path, - ) - print("Step 2 complete, added A/B tests and localization") - else: - output_path_2 = input_path_2 - print("Step 2 skipped, no AB testing sheet ID provided") - - #################################################################### - # Step 3: Catch errors pre-translation - #################################################################### - - input_path_3_1 = output_path_2 - output_file_name_3_1 = source_file_name + "_3_1_has_any_word_check" - has_any_words_log = "3_has_any_words_check" - - run_node( - "idems_translation_chatbot/index.js", - "has_any_words_check", - input_path_3_1, - outputpath, - output_file_name_3_1, - has_any_words_log, - ) - - input_path_3_2 = os.path.join(outputpath, output_file_name_3_1 + ".json") - integrity_log = "3_integrity_log" - excel_log_name = os.path.join(outputpath, "3_excel_log.xlsx") - - run_node( - "idems_translation_chatbot/index.js", - "overall_integrity_check", - input_path_3_2, - outputpath, - integrity_log, - excel_log_name, - ) - - print("Step 3 complete, reviewed files pre-translation") - - ##################################################################### - # Step 4: Extract Text to send to translators - ##################################################################### - - input_path_4_1 = input_path_3_2 - output_file_name_4_1 = source_file_name + "_4_english_for_translation" - - run_node( - "idems_translation_chatbot/index.js", - "extract_simple", - input_path_4_1, - outputpath, - output_file_name_4_1, - ) - - translator_folder = os.path.join(outputpath, "send_to_translators") - - # Setup output file to send to translators if it doesn't exist - if not os.path.exists(translator_folder): - os.makedirs(translator_folder) - - input_path_4_2 = os.path.join(outputpath, output_file_name_4_1 + ".json") - output_path_4_2 = os.path.join(translator_folder, crowdin_file_name + ".pot") - - run_node( - "idems_translation_common/index.js", - "convert", - input_path_4_2, - output_path_4_2, - ) - - print("Step 4 
complete, extracted text for translation") - - ##################################################################### - # Step 5: Localise translations back into JSON files - ##################################################################### - - input_path_5 = input_path_3_2 - output_file_name_5 = source_file_name + "_5_localised_translations" - - for lang in config.languages: - language = lang["language"] - - json_translation_path = os.path.join( - outputpath, language + "_translations", "merged_translations.json" - ) - - run_node( - "idems_translation_chatbot/index.js", - "localize", - input_path_5, - json_translation_path, - language, - output_file_name_5, - outputpath, - ) - - input_path_5 = os.path.join(outputpath, output_file_name_5 + ".json") - - print("Step 5 complete, localised translations back into JSON") - - ##################################################################### - # step 6: post translation edits - ##################################################################### - - input_path_6 = os.path.join(outputpath, output_file_name_5 + ".json") - log_file_path = os.path.join(outputpath, "6_dict_edits.log") - - input_edit_sheets = [ - sheet - for sheet in [config.eng_edits_sheet_id, config.transl_edits_sheet_id] - if sheet - ] - - if input_edit_sheets: - output_file_name_6 = source_file_name + "_6_dict_edits" - output_path_6 = os.path.join(outputpath, output_file_name_6 + ".json") - - apply_abtests( - input_path_6, - output_path_6, - input_edit_sheets, - "google_sheets", - log_file_path, - ) - print("Step 6 complete, text & translation edits made for dictionaries") - else: - output_path_6 = input_path_6 - print("Step 6 skipped, no dict edits sheet ID provided") - - ##################################################################### - # step 7: catch errors post translation - ##################################################################### - - input_path_7_1 = output_path_6 - output_file_name_7_1 = source_file_name + "_7_1_has_any_word_check" - has_any_words_log = "7_has_any_words_check" - - run_node( - "idems_translation_chatbot/index.js", - "has_any_words_check", - input_path_7_1, - outputpath, - output_file_name_7_1, - has_any_words_log, - ) - - input_path_7_2 = os.path.join(outputpath, output_file_name_7_1 + ".json") - output_file_name_7_2 = source_file_name + "_7_2_fix_arg_qr_translation" - fix_arg_qr_log = "7_arg_qr_log" - - run_node( - "idems_translation_chatbot/index.js", - "fix_arg_qr_translation", - input_path_7_2, - outputpath, - output_file_name_7_2, - fix_arg_qr_log, - ) - - input_path_7_3 = os.path.join(outputpath, output_file_name_7_2 + ".json") - integrity_log = "7_integrity_log" - excel_log_name = os.path.join(outputpath, "8_excel_log.xlsx") - - run_node( - "idems_translation_chatbot/index.js", - "overall_integrity_check", - input_path_7_3, - outputpath, - integrity_log, - excel_log_name, - ) - - print("Step 7 complete, reviewed files post translation") - - ##################################################################### - # step 8: add quick replies to message text and translation - ##################################################################### - - input_path_8 = os.path.join(outputpath, output_file_name_7_2 + ".json") - output_file_name_8 = source_file_name + "_8_modify_QR" - - # We can do different things to our quick replies depending on the deployment - # channel - if config.qr_treatment == "move": - run_node( - "idems_translation_chatbot/index.js", - "move_quick_replies", - input_path_8, - config.select_phrases, - 
output_file_name_8, - outputpath, - config.add_selectors, - str(config.qr_limit), - config.special_words, - ) - output_path_8 = os.path.join(outputpath, output_file_name_8 + ".json") - print("Step 8 complete, removed quick replies") - elif config.qr_treatment == "move_and_mod": - run_node( - "idems_translation_chatbot/index.js", - "move_and_mod_quick_replies", - input_path_8, - config.select_phrases, - config.replace_phrases, - output_file_name_8, - outputpath, - config.add_selectors, - str(config.qr_limit), - config.special_words, - ) - output_path_8 = os.path.join(outputpath, output_file_name_8 + ".json") - print("Step 8 complete, removed and modified quick replies") - elif config.qr_treatment == "reformat": - run_node( - "idems_translation_chatbot/index.js", - "reformat_quick_replies", - input_path_8, - config.select_phrases, - output_file_name_8, - outputpath, - config.count_threshold, - config.length_threshold, - str(config.qr_limit), - config.special_words, - ) - output_path_8 = os.path.join(outputpath, output_file_name_8 + ".json") - print("Step 8 complete, reformatted quick replies") - elif config.qr_treatment == "reformat_whatsapp": - run_node( - "idems_translation_chatbot/index.js", - "reformat_quick_replies_whatsapp", - input_path_8, - config.select_phrases, - output_file_name_8, - outputpath, - str(config.qr_limit), - config.special_words, - ) - output_path_8 = os.path.join(outputpath, output_file_name_8 + ".json") - print("Step 8 complete, reformatted quick replies to WhatsApp standard") - elif config.qr_treatment == "reformat_china": - run_node( - "idems_translation_chatbot/index.js", - "reformat_quick_replies_china", - input_path_8, - config.select_phrases, - output_file_name_8, - outputpath, - config.count_threshold, - config.length_threshold, - str(config.qr_limit), - config.special_words, - ) - output_path_8 = os.path.join(outputpath, output_file_name_8 + ".json") - print("Step 8 complete, reformatted quick replies to China standard") - elif config.qr_treatment == "wechat": - run_node( - "idems_translation_chatbot/index.js", - "convert_qr_to_html", - input_path_8, - output_file_name_8, - outputpath - ) - output_path_8 = os.path.join(outputpath, output_file_name_8 + ".json") - print("Step 8 complete, moved quick replies to html") - else: - output_path_8 = input_path_8 - print("Step 8 skipped, no QR edits specified") - - ##################################################################### - # step 9: implement safeguarding - ##################################################################### - - input_path_9a = output_path_8 - input_path_9b = None - output_path_9 = os.path.join( - outputpath, - source_file_name + "_9_safeguarding.json", - ) - - if config.sg_sources: - config.sg_path = Path(config.outputpath) / "safeguarding_words.json" - process_keywords_to_file(config.sg_sources, config.sg_path) - - if (config.sg_path and config.sg_flow_name and config.sg_flow_id): - run_node( - "safeguarding-rapidpro/v2_add_safeguarding_to_flows.js", - input_path_9a, - str(config.sg_path), - output_path_9, - config.sg_flow_id, - config.sg_flow_name, - ) - input_path_9b = output_path_9 - print("Safeguarding flows added") - - if config.sg_path and config.redirect_flow_names: - run_node( - "safeguarding-rapidpro/v2_edit_redirect_flow.js", - input_path_9b or input_path_9a, - str(config.sg_path), - output_path_9, - config.redirect_flow_names, - ) - print("Redirect flows edited") - - print("Step 9 completed") - - ##################################################################### - # step 
10. split files (if too big)? - ##################################################################### - - split_rapidpro_json(config, source, output_path_9) - print("Step 10 completed") - - -def download_archive(config, source): - location = source.get("archive") - archive_fp = os.path.join(config.outputpath, source["filename"] + ".zip") - - if location and location.startswith("http"): - response = requests.get(location) - - if response.ok: - with open(archive_fp, "wb") as archive: - archive.write(response.content) - print(f"Archive downloaded, url={location}, file={archive_fp}") - else: - print( - f"Archive download failed, " - f"status={response.status_code}, url={location}" - ) - - return archive_fp - else: - return location - - -def load_sheets(config, source, archive_fp): - output_path = os.path.join( - config.outputpath, - source["filename"] + "_1_1_load_from_sheets.json", - ) - spreadsheet_ids = source["spreadsheet_ids"] - tags = source["tags"] - - if archive_fp: - with tempfile.TemporaryDirectory() as temp_dir: - shutil.unpack_archive(archive_fp, temp_dir) - flows = create_flows( - [ - os.path.join(temp_dir, spreadsheet_id) - for spreadsheet_id in spreadsheet_ids - ], - None, - "csv", - data_models=config.model, - tags=tags, - ) - else: - flows = create_flows( - spreadsheet_ids, - None, - "google_sheets", - data_models=config.model, - tags=tags, - ) - - with open(output_path, "w") as export: - json.dump(flows, export, indent=4) - - print(f"RapidPro flows created, file={output_path}") - - return output_path - - -def download_translations_github(repo_url, folder_path, local_folder): - # Parse the repository URL to get the owner and repo name - parts = repo_url.split("/") - owner = parts[-2] - repo_name = parts[-1].split(".")[0] # Remove '.git' extension if present - - # Construct the GitHub API URL to get the contents of the folder - api_url = f"https://api.github.com/repos/{owner}/{repo_name}/contents/{folder_path}" - - try: - response = requests.get(api_url) - response.raise_for_status() - - if not os.path.exists(local_folder): - os.makedirs(local_folder) - - for item in response.json(): - local_file_path = Path(local_folder) / item["name"] - - if item["type"] == "file" and local_file_path.suffix == ".po": - response = requests.get(item["download_url"]) - response.raise_for_status() - - with open(local_file_path, "wb") as local_file: - local_file.write(response.content) - - except Exception as e: - print("An error occurred:", e) - - -def run_node(script, *args): - subprocess.run(["node", "node_modules/@idems/" + script, *args]) diff --git a/src/parenttext_pipeline/pull_data.py b/src/parenttext_pipeline/pull_data.py new file mode 100644 index 0000000..54947ec --- /dev/null +++ b/src/parenttext_pipeline/pull_data.py @@ -0,0 +1,196 @@ +import os +import shutil +import tempfile +from datetime import datetime, timezone +from pathlib import Path + +import requests +from rpft.converters import convert_to_json + +from parenttext_pipeline.common import ( + clear_or_create_folder, + get_input_subfolder, + get_sheet_id, + run_node, + write_meta, +) +from parenttext_pipeline.extract_keywords import process_keywords_to_file + + +def run(config): + clear_or_create_folder(config.inputpath) + clear_or_create_folder(config.temppath) + + for name, source in config.sources.items(): + if source.format == "sheets": + pull_sheets(config, source, name) + elif source.format == "json": + pull_json(config, source, name) + elif source.format == "translation_repo": + pull_translations(config, source, name) + 
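+        # Illustration only (hypothetical values; the attribute names are the
+        # ones this module reads): a "sheets" source might carry
+        #     subformat="google_sheets", files_list=["parenttext_all"],
+        # a "json" source might carry
+        #     files_dict={"ab_testing": "./edits/ab_testing.json"},
+        # and a "safeguarding" source carries either keyword `sources` or a
+        # local `filepath`. Each entry is dispatched to a pull_* function.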
+        elif source.format == "safeguarding":
+            pull_safeguarding(config, source, name)
+        else:
+            raise ValueError(f"Invalid source format {source.format}")
+
+        print(f"Pulled all {name} data")
+
+    meta = {
+        "pull_timestamp": str(datetime.now(timezone.utc)),
+    }
+    write_meta(config, meta, config.inputpath)
+
+    print("DONE.")
+
+
+def pull_translations(config, source, source_name):
+    for lang in source.languages:
+        lang_code = lang["code"]
+        translations_input_folder = Path(config.inputpath) / source_name / lang_code
+        translations_temp_folder = Path(config.temppath) / source_name / lang_code
+
+        if os.path.exists(translations_input_folder):
+            shutil.rmtree(translations_input_folder)
+
+        os.makedirs(translations_input_folder)
+        os.makedirs(translations_temp_folder)
+
+        # Download relevant PO translation files from GitHub to temp folder
+        language_folder_in_repo = source.folder_within_repo + "/" + lang_code
+        translation_temp_po_folder = os.path.join(
+            translations_temp_folder, "raw_po_files"
+        )
+        download_translations_github(
+            source.translation_repo,
+            language_folder_in_repo,
+            translation_temp_po_folder,
+        )
+
+        # Convert PO to json and write these to input folder
+        for root, dirs, files in os.walk(translation_temp_po_folder):
+            for file in files:
+                file_name = Path(file).stem
+                source_file_path = os.path.join(root, file)
+                dest_file_path = os.path.join(
+                    translations_input_folder,
+                    file_name + ".json",
+                )
+                run_node(
+                    "idems_translation_common/index.js",
+                    "convert",
+                    source_file_path,
+                    dest_file_path,
+                )
+
+
+def pull_sheets(config, source, source_name):
+    # Download all sheets used for flow creation and edits and store as json
+    source_input_path = get_input_subfolder(
+        config, source_name, makedirs=True, in_temp=False
+    )
+
+    jsons = {}
+    if source.files_archive is not None:
+        if source.subformat != "csv":
+            raise NotImplementedError(
+                "files_archive only supported for sheets of subformat csv."
+            )
+        location = source.files_archive
+        archive_filepath = download_archive(config.temppath, location)
+        with tempfile.TemporaryDirectory() as temp_dir:
+            shutil.unpack_archive(archive_filepath, temp_dir)
+            for sheet_id in source.files_list:
+                csv_folder = os.path.join(temp_dir, sheet_id)
+                jsons[sheet_id] = convert_to_json([csv_folder], source.subformat)
+    else:
+        if source.subformat != "google_sheets":
+            raise NotImplementedError(
+                "files_list only supported for sheets of subformat google_sheets."
+            )
+        for sheet_name in source.files_list:
+            sheet_id = get_sheet_id(config, sheet_name)
+            jsons[sheet_name] = convert_to_json(sheet_id, source.subformat)
+        for new_name, sheet_id in source.files_dict.items():
+            jsons[new_name] = convert_to_json(sheet_id, source.subformat)
+
+    for sheet_name, content in jsons.items():
+        with open(
+            source_input_path / f"{sheet_name}.json", "w", encoding="utf-8"
+        ) as export:
+            export.write(content)
+
+
+def pull_json(config, source, source_name):
+    # Postprocessing files
+    source_input_path = get_input_subfolder(
+        config, source_name, makedirs=True, in_temp=False
+    )
+
+    for new_name, filepath in source.files_dict.items():
+        shutil.copyfile(filepath, source_input_path / f"{new_name}.json")
+
+
+def pull_safeguarding(config, source, source_name):
+    # Safeguarding files
+    source_input_path = get_input_subfolder(
+        config, source_name, makedirs=True, in_temp=False
+    )
+    safeguarding_file_path = source_input_path / "safeguarding_words.json"
+    if source.sources:
+        process_keywords_to_file(source.sources, safeguarding_file_path)
+    else:
+        shutil.copyfile(source.filepath, safeguarding_file_path)
+
+
+def unpack_archive(destination, location):
+    with tempfile.TemporaryDirectory() as temp_dir:
+        location = download_archive(temp_dir, location)
+        shutil.unpack_archive(location, destination)
+
+
+def download_archive(destination, location):
+    if location and location.startswith("http"):
+        # Define the target path up front so it is always bound when we return
+        archive_destinationpath = os.path.join(destination, "archive.zip")
+        response = requests.get(location)
+
+        if response.ok:
+            with open(archive_destinationpath, "wb") as archive:
+                archive.write(response.content)
+            print(
+                f"Archive downloaded, url={location}, "
+                f"file={archive_destinationpath}"
+            )
+        else:
+            print(
+                f"Archive download failed, "
+                f"status={response.status_code}, url={location}"
+            )
+
+        return archive_destinationpath
+    else:
+        return location
+
+
+def download_translations_github(repo_url, folder_path, local_folder):
+    # Parse the repository URL to get the owner and repo name
+    parts = repo_url.split("/")
+    owner = parts[-2]
+    repo_name = parts[-1].split(".")[0]  # Remove '.git' extension if present
+
+    # Construct the GitHub API URL to get the contents of the folder
+    api_url = f"https://api.github.com/repos/{owner}/{repo_name}/contents/{folder_path}"
+
+    try:
+        response = requests.get(api_url)
+        response.raise_for_status()
+
+        if not os.path.exists(local_folder):
+            os.makedirs(local_folder)
+
+        for item in response.json():
+            local_file_path = Path(local_folder) / item["name"]
+
+            if item["type"] == "file" and local_file_path.suffix == ".po":
+                response = requests.get(item["download_url"])
+                response.raise_for_status()
+
+                with open(local_file_path, "wb") as local_file:
+                    local_file.write(response.content)
+
+    except Exception as e:
+        print("An error occurred:", e)
diff --git a/src/parenttext_pipeline/safeguarding_words.json b/src/parenttext_pipeline/safeguarding_words.json
deleted file mode 100644
index 3416487..0000000
--- a/src/parenttext_pipeline/safeguarding_words.json
+++ /dev/null
@@ -1,2445 +0,0 @@
-{
-    "generic": [
-        {
-            "English": {
-                "keywords": [
-                    "kidnap",
-                    "kidnapping",
-                    "kidnapper",
-                    "kidnapped"
-                ],
-                "mispellings": [
-                    "kiddnapp",
-                    "kidnapp",
-                    "kiddnap",
-                    "kidnape"
-                ]
-            },
-            "Translation": {
-                "keywords": [
-                    "kutfumba",
-                    "kutfumba",
-                    "umtfumbi",
-                    "lotfunjiwe"
-                ],
-                "mispellings": [
-                    "ftumba",
-                    "tfunba",
-                    "tfunnba",
-                    "ukutfumbe"
-                ]
-            }
-        },
-        {
-            "English": {
-                "keywords": [
-                    "death",
-                    "dead",
-                    "dying"
-                ],
-                "mispellings": [
-                    "deeath",
-                    "deat"
-                ]
-            },
-            "Translation": {
-
"keywords": [ - "kufa", - "lofile", - "lofako" - ], - "mispellings": [ - "khufa", - "kuf" - ] - } - }, - { - "English": { - "keywords": [ - "illegal" - ], - "mispellings": [ - "ilegal", - "ellegal", - "ileagaly" - ] - }, - "Translation": { - "keywords": [ - "lokungekho emtsetfweni" - ], - "mispellings": [ - "kungeko emstetfweni", - "ngekho ensteftweni", - "ngalokungeko emstetfeni" - ] - } - }, - { - "English": { - "keywords": [ - "poison", - "poisoning", - "poisoned" - ], - "mispellings": [ - "pesian", - "boisoned", - "poision", - "poisioning", - "poisioned" - ] - }, - "Translation": { - "keywords": [ - "shevu", - "kufakela shevu", - "lofakelwe shevu" - ], - "mispellings": [ - "shevi", - "lofkelwe shevu", - "hsevu", - "kufakala shebu", - "kufakwe shevu" - ] - } - }, - { - "English": { - "keywords": [ - "crisis", - "crises" - ], - "mispellings": [ - "krisis", - "krises" - ] - }, - "Translation": { - "keywords": [ - "simo lesibucayi", - "simo lesibucayi" - ], - "mispellings": [ - "sumo lesibucayi", - "simu lesicayi" - ] - } - }, - { - "English": { - "keywords": [ - "trouble", - "troubling", - "troublesome", - "troubled" - ], - "mispellings": [ - "troubl", - "trevel", - "trubl" - ] - }, - "Translation": { - "keywords": [ - "inhlupheko", - "kuhlupha", - "lohluphako", - "lohlushiwe" - ], - "mispellings": [ - "inkenga", - "inkengs", - "inhlup" - ] - } - }, - { - "English": { - "keywords": [ - "victim", - "victimised" - ], - "mispellings": [ - "victm", - "victimized", - "victem" - ] - }, - "Translation": { - "keywords": [ - "lohlushiwe", - "lohlushiwe" - ], - "mispellings": [ - "lokhushiwe", - "lohlshiwe", - "lohleshiwe" - ] - } - }, - { - "English": { - "keywords": [ - "emergency", - "911.0" - ], - "mispellings": [ - "emergincy", - "emergincey", - "emergince" - ] - }, - "Translation": { - "keywords": [ - "simo lesiphutfumako" - ], - "mispellings": [ - "simo lsiphutfumako", - "simo lesihpuftumako", - "isimo lesiphutfu" - ] - } - }, - { - "English": { - "keywords": [ - "danger", - "dangerous", - "endangered" - ], - "mispellings": [ - "dangeur", - "dangeer", - "dangre" - ] - }, - "Translation": { - "keywords": [ - "ingoti", - "kuyingoti", - "engotini" - ], - "mispellings": [ - "inggoti", - "kuyinguti", - "ingooti" - ] - } - }, - { - "English": { - "keywords": [ - "stalking" - ], - "mispellings": [ - "staling", - "staking", - "stakling", - "stalkin" - ] - }, - "Translation": { - "keywords": [ - "kutuma" - ], - "mispellings": [ - "ktuma", - "kutum", - "kutumu", - "kutm" - ] - } - }, - { - "English": { - "keywords": [ - "follow", - "following", - "followed" - ], - "mispellings": [ - "folow", - "folowed", - "followin", - "folowin" - ] - }, - "Translation": { - "keywords": [ - "kulandzela", - "lanzelayo", - "kulandzelwe" - ], - "mispellings": [ - "landxelwa", - "landzelws", - "kulandzel", - "kulanzel" - ] - } - }, - { - "English": { - "keywords": [ - "suffer", - "suffering", - "suffered" - ], - "mispellings": [] - }, - "Translation": { - "keywords": [ - "hlupheka", - "kuhlupheka", - "hluphekile" - ], - "mispellings": [] - } - }, - { - "English": { - "keywords": [ - "hide", - "hiding", - "hid" - ], - "mispellings": [] - }, - "Translation": { - "keywords": [ - "fihla", - "kufihla", - "fihlile" - ], - "mispellings": [] - } - } - ], - "health": [ - { - "English": { - "keywords": [ - "vomit", - "vomitted", - "vomitting" - ], - "mispellings": [ - "yak", - "vomited", - "womit" - ] - }, - "Translation": { - "keywords": [ - "hlanta", - "hlantile", - "lohlantako" - ], - "mispellings": [ - "hhlanta", - "hlantle", 
- "hatla" - ] - } - }, - { - "English": { - "keywords": [ - "puke", - "puking", - "puked" - ], - "mispellings": [] - }, - "Translation": { - "keywords": [ - "hlanta", - "kuhlanta", - "lohlantile" - ], - "mispellings": [] - } - }, - { - "English": { - "keywords": [ - "disease", - "STD", - "diseases" - ], - "mispellings": [ - "diseaze", - "dizeese", - "SDT" - ] - }, - "Translation": { - "keywords": [ - "sifo", - "STD", - "tifo" - ], - "mispellings": [ - "ssifo", - "sigo", - "SDT" - ] - } - }, - { - "English": { - "keywords": [ - "sick", - "sickness", - "sickly" - ], - "mispellings": [ - "sik", - "seck" - ] - }, - "Translation": { - "keywords": [ - "gula", - "kugula", - "butsakatsaka" - ], - "mispellings": [ - "guls", - "gulla" - ] - } - }, - { - "English": { - "keywords": [ - "ill", - "illness", - "illnesses" - ], - "mispellings": [ - "illl", - "illniss", - "illnees" - ] - }, - "Translation": { - "keywords": [ - "gula", - "kugula", - "tifo" - ], - "mispellings": [ - "fula", - "kuggula", - "isiffoo" - ] - } - }, - { - "English": { - "keywords": [ - "health", - "healthy" - ], - "mispellings": [ - "haelth", - "healthy", - "helath" - ] - }, - "Translation": { - "keywords": [ - "imphilo", - "lophilile" - ], - "mispellings": [ - "immphilo", - "pilile", - "temhpilo" - ] - } - }, - { - "English": { - "keywords": [ - "hospital", - "clinic" - ], - "mispellings": [ - "hopital", - "hopsital" - ] - }, - "Translation": { - "keywords": [ - "sibhedlela", - "umtfolamphilo" - ], - "mispellings": [ - "sihbedlela", - "sibhhedlela" - ] - } - }, - { - "English": { - "keywords": [ - "miscarriage", - "miscarry" - ], - "mispellings": [ - "miscariage", - "miscarig" - ] - }, - "Translation": { - "keywords": [ - "kuphunyelwa sisu", - "kuphunyelwa sisu" - ], - "mispellings": [ - "kuphhunyelwa ssu", - "kupunyelwa siso" - ] - } - }, - { - "English": { - "keywords": [ - "pregnant", - "pregnancy", - "expecting" - ], - "mispellings": [ - "pegant", - "preganci", - "prengant" - ] - }, - "Translation": { - "keywords": [ - "ukhulelwe", - "kukhulelwa", - "kukhulelwa" - ], - "mispellings": [ - "kulela", - "kuhkulelwa", - "kukukhulelwa" - ] - } - }, - { - "English": { - "keywords": [ - "abortion", - "abort", - "aborted", - "aborting" - ], - "mispellings": [ - "abbortion", - "obortion" - ] - }, - "Translation": { - "keywords": [ - "kukhishwa kwesisu", - "kukhipha sisu", - "kukhishwe sisu" - ], - "mispellings": [ - "kukhhipa sisu", - "kukhiipha" - ] - } - }, - { - "English": { - "keywords": [ - "COVID", - "COVID19" - ], - "mispellings": [] - }, - "Translation": { - "keywords": [ - "i-COVID", - "i-COVID19" - ], - "mispellings": [] - } - } - ], - "mental_health": [ - { - "English": { - "keywords": [ - "anxiety", - "anxious", - "anxiousness" - ], - "mispellings": [ - "angziaty", - "anxieti", - "anxeity" - ] - }, - "Translation": { - "keywords": [ - "luvalo", - "uneluvalo", - "kuba neluvalo" - ], - "mispellings": [ - "lubalo", - "luvallo", - "luvelo" - ] - } - }, - { - "English": { - "keywords": [ - "depression", - "depressive", - "depressed" - ], - "mispellings": [ - "despression", - "despressive", - "despressed" - ] - }, - "Translation": { - "keywords": [ - "kukhatsateka", - "lokukhatsatako", - "khatsatekile" - ], - "mispellings": [ - "kukhastata", - "khatseteka", - "kukkhatsateka" - ] - } - }, - { - "English": { - "keywords": [ - "disorder", - "disorders" - ], - "mispellings": [ - "dizorder", - "dissorder", - "isorder" - ] - }, - "Translation": { - "keywords": [ - "simo lesingalungi", - "timo letingalungi" - ], - "mispellings": [ - 
"simo llesingalugni", - "simo llesingalungi", - "imo lesingalungi" - ] - } - }, - { - "English": { - "keywords": [ - "therapy" - ], - "mispellings": [] - }, - "Translation": { - "keywords": [ - "kwelashwa" - ], - "mispellings": [] - } - }, - { - "English": { - "keywords": [ - "psychologist" - ], - "mispellings": [] - }, - "Translation": { - "keywords": [ - "dokotela wengcondvo" - ], - "mispellings": [] - } - }, - { - "English": { - "keywords": [ - "suicide", - "suicidal", - "suicides" - ], - "mispellings": [ - "suicido", - "suiced" - ] - }, - "Translation": { - "keywords": [ - "kutibulala", - "ukutibulala", - "ukutibulala" - ], - "mispellings": [ - "kutibhulala", - "kubulula" - ] - } - }, - { - "English": { - "keywords": [ - "stress", - "stressed" - ], - "mispellings": [ - "stresss", - "strees", - "srtess" - ] - }, - "Translation": { - "keywords": [ - "kukhatsateka", - "khatsatekile" - ], - "mispellings": [ - "khatsata", - "kukhstatateka", - "kukkhatateka" - ] - } - }, - { - "English": { - "keywords": [ - "distress", - "distressed" - ], - "mispellings": [ - "destressed", - "destresss" - ] - }, - "Translation": { - "keywords": [ - "lusizi", - "kuba nelusizi" - ], - "mispellings": [ - "luzisi", - "luzizi" - ] - } - }, - { - "English": { - "keywords": [ - "sad", - "saddened" - ], - "mispellings": [ - "sadd", - "saddenned", - "sadend" - ] - }, - "Translation": { - "keywords": [ - "dzabuka", - "dzabukile" - ], - "mispellings": [ - "dzabbuka", - "dxabukile", - "kudzabka" - ] - } - }, - { - "English": { - "keywords": [ - "scare", - "scared" - ], - "mispellings": [ - "csared", - "csare" - ] - }, - "Translation": { - "keywords": [ - "kusabisa", - "kwesaba" - ], - "mispellings": [ - "ssaba", - "khwesaba" - ] - } - }, - { - "English": { - "keywords": [ - "afraid" - ], - "mispellings": [ - "afaid" - ] - }, - "Translation": { - "keywords": [ - "kwesaba" - ], - "mispellings": [ - "khwesaba" - ] - } - }, - { - "English": { - "keywords": [ - "worried" - ], - "mispellings": [ - "woried" - ] - }, - "Translation": { - "keywords": [ - "khatsatekile" - ], - "mispellings": [ - "kkhastatekile" - ] - } - }, - { - "English": { - "keywords": [ - "fear", - "frighten" - ], - "mispellings": [ - "fer", - "fire" - ] - }, - "Translation": { - "keywords": [ - "kwesaba", - "sabisa" - ], - "mispellings": [ - "kisaba", - "ftusa" - ] - } - }, - { - "English": { - "keywords": [ - "panic", - "panicking", - "panicked" - ], - "mispellings": [ - "panik", - "paniked", - "paniking" - ] - }, - "Translation": { - "keywords": [ - "kutfuka", - "kutfuka", - "tfuka" - ], - "mispellings": [ - "kuftuka", - "kwetfuka", - "chachatela" - ] - } - }, - { - "English": { - "keywords": [ - "mad" - ], - "mispellings": [ - "md", - "made" - ] - }, - "Translation": { - "keywords": [ - "kuhlanya" - ], - "mispellings": [ - "hllanya" - ] - } - }, - { - "English": { - "keywords": [ - "grief" - ], - "mispellings": [ - "greif", - "griev" - ] - }, - "Translation": { - "keywords": [ - "lusizi" - ], - "mispellings": [ - "luaizi", - "luizi" - ] - } - } - ], - "natural_disasters": [ - { - "English": { - "keywords": [ - "fire", - "wildfire" - ], - "mispellings": [ - "blaze", - "burnt", - "hot", - "fir" - ] - }, - "Translation": { - "keywords": [ - "umlilo", - "umlilo wasendle" - ], - "mispellings": [ - "langabi", - "shile", - "shisa", - "umlillo" - ] - } - }, - { - "English": { - "keywords": [ - "earthquake", - "Landslide", - "collapse" - ], - "mispellings": [ - "shake", - "irthquake", - "earthquak", - "fall" - ] - }, - "Translation": { - "keywords": [ - 
"kutamatama kwemhlaba", - "Kugedvuka kwemhlaba", - "kuwela phansi" - ], - "mispellings": [ - "kuzamazama", - "kutamatama kwemhlabe", - "kuwa" - ] - } - }, - { - "English": { - "keywords": [ - "flood", - "flooded", - "flooding", - "monsoon" - ], - "mispellings": [ - "flud", - "flod", - "munson" - ] - }, - "Translation": { - "keywords": [ - "sikhukhula", - "kunesikhukhula", - "kukhukhula", - "imvula" - ], - "mispellings": [ - "sihkukhula", - "sikhulula", - "imvala" - ] - } - }, - { - "English": { - "keywords": [ - "smoke", - "smoked" - ], - "mispellings": [ - "smooke", - "smok" - ] - }, - "Translation": { - "keywords": [ - "intfutfu", - "kunentfutfu" - ], - "mispellings": [ - "bhunya", - "futsa" - ] - } - }, - { - "English": { - "keywords": [ - "typhoon", - "hurricane", - "tornado", - "cyclone" - ], - "mispellings": [ - "storm", - "rain", - "water", - "stormy" - ] - }, - "Translation": { - "keywords": [ - "inkanyamba", - "siphephe", - "sishingishane", - "siphepho" - ], - "mispellings": [ - "sipepho", - "imnvula", - "emanti", - "phephetsa" - ] - } - }, - { - "English": { - "keywords": [ - "droughts", - "dry", - "aridity" - ], - "mispellings": [ - "dri" - ] - }, - "Translation": { - "keywords": [ - "tomiso", - "komile", - "lugwadvule" - ], - "mispellings": [ - "komisa" - ] - } - }, - { - "English": { - "keywords": [ - "volcanic eruption", - "volcano" - ], - "mispellings": [ - "fire", - "hot" - ] - }, - "Translation": { - "keywords": [ - "kukhafula umlilo", - "sikhafulamlilo" - ], - "mispellings": [ - "mlilo", - "siyalu semlilo" - ] - } - }, - { - "English": { - "keywords": [ - "wind", - "blast", - "windy" - ], - "mispellings": [ - "air" - ] - }, - "Translation": { - "keywords": [ - "umoya lohhushako", - "umoya lonemandla", - "kuhhusha" - ], - "mispellings": [ - "umoya" - ] - } - } - ], - "violence": [ - { - "English": { - "keywords": [ - "hit", - "hitting", - "hits" - ], - "mispellings": [ - "hitt", - "hitin", - "hittin" - ] - }, - "Translation": { - "keywords": [ - "shaya", - "kushaya", - "kushaya" - ], - "mispellings": [ - "hsaya", - "yashay", - "shhaya" - ] - } - }, - { - "English": { - "keywords": [ - "beat", - "beating", - "beats" - ], - "mispellings": [ - "beetin", - "beeting", - "beets", - "bet" - ] - }, - "Translation": { - "keywords": [ - "shaya", - "ushaya", - "ushaya" - ], - "mispellings": [ - "shayy", - "betsa", - "kubetsa", - "bets" - ] - } - }, - { - "English": { - "keywords": [ - "abuse", - "abusive", - "abusing", - "Abused" - ], - "mispellings": [ - "abuzin", - "abuz", - "buse" - ] - }, - "Translation": { - "keywords": [ - "hlukubeta", - "lohlukubetako", - "yahlukubeta", - "hlukubetiwe" - ], - "mispellings": [ - "hlukumeta", - "yahlukumeta", - "hlkbeta" - ] - } - }, - { - "English": { - "keywords": [ - "violent", - "violence" - ], - "mispellings": [ - "violnce", - "vilent" - ] - }, - "Translation": { - "keywords": [ - "unebudlova", - "budlova" - ], - "mispellings": [ - "nebuldova", - "bldova" - ] - } - }, - { - "English": { - "keywords": [ - "pain", - "painful", - "pains", - "paining" - ], - "mispellings": [ - "pen", - "poen" - ] - }, - "Translation": { - "keywords": [ - "buhlungu", - "kubuhlungu", - "tinhlungu", - "kuvisa buhlungu" - ], - "mispellings": [ - "hlungy", - "buhhlung" - ] - } - }, - { - "English": { - "keywords": [ - "blood", - "bleeding", - "bled" - ], - "mispellings": [ - "blud", - "bleding", - "bleedding" - ] - }, - "Translation": { - "keywords": [ - "ingati", - "kopha", - "ophile" - ], - "mispellings": [ - "ingaati", - "bopha", - "yopha" - ] - } - }, - { - 
"English": { - "keywords": [ - "bruise", - "bruising", - "bruised" - ], - "mispellings": [ - "brusing", - "brused", - "broosed" - ] - }, - "Translation": { - "keywords": [ - "umhuzu", - "kuhuzula", - "huzuliwe" - ], - "mispellings": [ - "kuhazuka", - "kuhuzla", - "kuhazukile" - ] - } - }, - { - "English": { - "keywords": [ - "hurt", - "hurting", - "hurts" - ], - "mispellings": [ - "hurted", - "hurrt" - ] - }, - "Translation": { - "keywords": [ - "limata", - "kulimata", - "kulimata" - ], - "mispellings": [ - "linata", - "limmata" - ] - } - }, - { - "English": { - "keywords": [ - "ache", - "aching", - "ached" - ], - "mispellings": [ - "acked" - ] - }, - "Translation": { - "keywords": [ - "buhlungu", - "kuva buhlungu", - "uvise buhlungu" - ], - "mispellings": [ - "buhllung" - ] - } - }, - { - "English": { - "keywords": [ - "injury", - "injured" - ], - "mispellings": [ - "inguries" - ] - }, - "Translation": { - "keywords": [ - "ingoti yekulimala", - "benengoti yekulimala" - ], - "mispellings": [ - "inkoti" - ] - } - }, - { - "English": { - "keywords": [ - "cut", - "cutting" - ], - "mispellings": [ - "cutt" - ] - }, - "Translation": { - "keywords": [ - "kusika", - "kusika" - ], - "mispellings": [ - "aika" - ] - } - }, - { - "English": { - "keywords": [ - "harm", - "harming", - "harmful", - "harmed" - ], - "mispellings": [ - "harrm", - "harme", - "haarm", - "harmng" - ] - }, - "Translation": { - "keywords": [ - "hlupha", - "kuhlupha", - "yahlupha", - "hluphile" - ], - "mispellings": [ - "hlupa", - "hluphe", - "hluupha", - "hllupha" - ] - } - }, - { - "English": { - "keywords": [ - "slap", - "slapped", - "slapping" - ], - "mispellings": [ - "slip", - "slappping", - "slaped", - "slape" - ] - }, - "Translation": { - "keywords": [ - "kushaya ngemphama", - "ushaye ngemphama", - "kushaya ngemphama" - ], - "mispellings": [ - "kushaywe ngemphama", - "kushaywa ngemphama", - "kushhaya ngemphama", - "ushaya ngemphama" - ] - } - }, - { - "English": { - "keywords": [ - "punch", - "punched", - "punching", - "punches" - ], - "mispellings": [] - }, - "Translation": { - "keywords": [ - "shay ngesibhakela", - "shaye ngesibhakela", - "kushaya ngesibhakela", - "tibhakela" - ], - "mispellings": [] - } - }, - { - "English": { - "keywords": [ - "kill", - "killing", - "killed", - "kills" - ], - "mispellings": [ - "cill", - "cills", - "kilin" - ] - }, - "Translation": { - "keywords": [ - "bulala", - "kubulala", - "bulele", - "yabulala" - ], - "mispellings": [ - "bullala", - "yavulala", - "bulal" - ] - } - }, - { - "English": { - "keywords": [ - "murder", - "murderer", - "murdering", - "murdered" - ], - "mispellings": [ - "merdered", - "murd", - "mrdered" - ] - }, - "Translation": { - "keywords": [ - "kubulala", - "umbulali", - "kubulala", - "bulewe" - ], - "mispellings": [ - "bulelwe", - "vulllelwe", - "kbulelwe" - ] - } - }, - { - "English": { - "keywords": [ - "bang", - "banging", - "banged" - ], - "mispellings": [ - "baning", - "bangin", - "pang" - ] - }, - "Translation": { - "keywords": [ - "shaya kakhulu", - "kushaya kakhulu", - "shaye kakhulu" - ], - "mispellings": [ - "betsa", - "shaya kakhhulu", - "shayya kakhul" - ] - } - }, - { - "English": { - "keywords": [ - "blow" - ], - "mispellings": [ - "below", - "blo", - "bow" - ] - }, - "Translation": { - "keywords": [ - "sibhakela" - ], - "mispellings": [ - "sibakela", - "sbakela", - "sibbakela" - ] - } - }, - { - "English": { - "keywords": [ - "cane" - ], - "mispellings": [ - "stick", - "can" - ] - }, - "Translation": { - "keywords": [ - "umzaca" - ], - 
"mispellings": [ - "luswati", - "angahle" - ] - } - }, - { - "English": { - "keywords": [ - "belt" - ], - "mispellings": [ - "girdle", - "strap" - ] - }, - "Translation": { - "keywords": [ - "libhande" - ], - "mispellings": [ - "licotfo", - "libande" - ] - } - }, - { - "English": { - "keywords": [ - "stab", - "stabbing", - "stabbed" - ], - "mispellings": [ - "sabbed", - "satb", - "stabing" - ] - }, - "Translation": { - "keywords": [ - "gwaza", - "kugwaza", - "gwazile" - ], - "mispellings": [ - "hlaba", - "fwazile", - "kugwaz" - ] - } - }, - { - "English": { - "keywords": [ - "scream", - "screaming", - "screamed" - ], - "mispellings": [ - "screem", - "creamed", - "screemed" - ] - }, - "Translation": { - "keywords": [ - "memeta", - "kumemeta", - "memetile" - ], - "mispellings": [ - "nemeta", - "mmeemeta", - "mmemetile" - ] - } - }, - { - "English": { - "keywords": [ - "push", - "pushing", - "pushed" - ], - "mispellings": [ - "puss", - "pusing" - ] - }, - "Translation": { - "keywords": [ - "fuca", - "yafuca", - "fucile" - ], - "mispellings": [ - "phusha", - "fucca" - ] - } - }, - { - "English": { - "keywords": [ - "shove", - "shoving", - "shoved" - ], - "mispellings": [ - "shoove", - "shooved", - "shov" - ] - }, - "Translation": { - "keywords": [ - "sundvuta", - "kusundvuta", - "sundvutile" - ], - "mispellings": [ - "sudvuta", - "sundvvuta", - "suvvuta" - ] - } - }, - { - "English": { - "keywords": [ - "kick", - "kicking", - "kicked" - ], - "mispellings": [ - "kiked", - "kik", - "kickin" - ] - }, - "Translation": { - "keywords": [ - "khahlela", - "kukhahlela", - "khahlele" - ], - "mispellings": [ - "khalhela", - "khahlel", - "khahllela" - ] - } - }, - { - "English": { - "keywords": [ - "strike", - "striking", - "stricked" - ], - "mispellings": [ - "strikd", - "strik", - "stricke" - ] - }, - "Translation": { - "keywords": [ - "shaya", - "yashaya", - "shayile" - ], - "mispellings": [ - "shay", - "shayy", - "shayiwe" - ] - } - }, - { - "English": { - "keywords": [ - "torture", - "tortured" - ], - "mispellings": [ - "tortue", - "tortued", - "torured" - ] - }, - "Translation": { - "keywords": [ - "visa buhlungu", - "kuvisa buhlungu" - ], - "mispellings": [ - "visa bulhungu", - "visa buhlung", - "buhllung" - ] - } - }, - { - "English": { - "keywords": [ - "throw", - "throwing", - "throws" - ], - "mispellings": [ - "trow", - "trowing", - "trows" - ] - }, - "Translation": { - "keywords": [ - "jika", - "kujika", - "yajika" - ], - "mispellings": [ - "yika", - "yayika", - "yyika" - ] - } - }, - { - "English": { - "keywords": [ - "rape", - "raping", - "rapist" - ], - "mispellings": [ - "rapeing", - "rapping", - "raipe", - "rap" - ] - }, - "Translation": { - "keywords": [ - "dlwengula", - "kudlwengula", - "umdlwenguli" - ], - "mispellings": [ - "dlwengulla", - "kuddlwengula", - "kudlengula", - "dlwengula" - ] - } - }, - { - "English": { - "keywords": [ - "violate", - "violation", - "violated" - ], - "mispellings": [ - "violat", - "vilate", - "volated" - ] - }, - "Translation": { - "keywords": [ - "yona", - "kona", - "oniwe" - ], - "mispellings": [ - "yonna", - "yoona", - "yonniwe" - ] - } - }, - { - "English": { - "keywords": [ - "sex", - "sexual", - "sexually", - "sext" - ], - "mispellings": [ - "sexting", - "xxx", - "ex" - ] - }, - "Translation": { - "keywords": [ - "kulalana", - "kwekulalana", - "ngekulalana", - "umlayeto ngekulalana" - ], - "mispellings": [ - "kutfumelelana titfombe letikhombisa kungcola", - "i-xxx", - "i-ex" - ] - } - }, - { - "English": { - "keywords": [ - "force", - "forced", - 
"forcing" - ], - "mispellings": [ - "forc", - "forcin" - ] - }, - "Translation": { - "keywords": [ - "phoca", - "phocile", - "kuphoca" - ], - "mispellings": [ - "phoc", - "kuphoc" - ] - } - }, - { - "English": { - "keywords": [ - "pressure", - "pressured", - "pressuring" - ], - "mispellings": [ - "perssure", - "perssured", - "presssure" - ] - }, - "Translation": { - "keywords": [ - "cindzetela", - "cindzetelwe", - "kucindzetela" - ], - "mispellings": [ - "chindzetela", - "chindetelwe", - "kucinndetela" - ] - } - }, - { - "English": { - "keywords": [ - "coercive", - "coerce", - "coerced" - ], - "mispellings": [ - "coersive", - "corrisive" - ] - }, - "Translation": { - "keywords": [ - "ngekuphoca", - "kuphoca", - "phociwe" - ], - "mispellings": [ - "poca", - "kupphoca" - ] - } - }, - { - "English": { - "keywords": [ - "misconduct", - "inappropriate", - "unacceptable" - ], - "mispellings": [ - "miscund", - "inappropiate" - ] - }, - "Translation": { - "keywords": [ - "kutiphatsa kabi", - "lokungalungi", - "lokungemukeleki" - ], - "mispellings": [ - "kutiphasta kabi", - "lokungakungi" - ] - } - }, - { - "English": { - "keywords": [ - "harassment", - "stalking", - "perpetration" - ], - "mispellings": [ - "stolk" - ] - }, - "Translation": { - "keywords": [ - "kuhlukubeta", - "kutuma", - "kwenta bubi" - ], - "mispellings": [ - "kuhlupha" - ] - } - }, - { - "English": { - "keywords": [ - "blame", - "blaming" - ], - "mispellings": [ - "blam" - ] - }, - "Translation": { - "keywords": [ - "sola", - "kusola" - ], - "mispellings": [ - "sol" - ] - } - } - ], - "drugs": [ - { - "English": { - "keywords": [ - "drugs", - "drug" - ], - "mispellings": [ - "drog", - "drogs" - ] - }, - "Translation": { - "keywords": [ - "tidzakamiva", - "sidzakamiva" - ], - "mispellings": [ - "sidzakwa", - "tidakamiva" - ] - } - }, - { - "English": { - "keywords": [ - "addict", - "addiction", - "dependence" - ], - "mispellings": [] - }, - "Translation": { - "keywords": [ - "umhuci", - "kuhuca", - "kuphila ngetidzakamiva" - ], - "mispellings": [] - } - }, - { - "English": { - "keywords": [ - "cocaine", - "coke", - "blow", - "charlie" - ], - "mispellings": [ - "crack", - "white", - "koke" - ] - }, - "Translation": { - "keywords": [ - "i-cocaine", - "i-coke", - "i-blow", - "i-charlie" - ], - "mispellings": [ - "sihlahla", - "i-white", - "i-koke" - ] - } - }, - { - "English": { - "keywords": [ - "heroine", - "brown", - "smack", - "skag" - ], - "mispellings": [ - "hose", - "hero" - ] - }, - "Translation": { - "keywords": [ - "i-horoine", - "i-brown", - "i-smack", - "i-skag" - ], - "mispellings": [ - "i-hose", - "i-hero" - ] - } - }, - { - "English": { - "keywords": [ - "high", - "tipsy" - ], - "mispellings": [] - }, - "Translation": { - "keywords": [ - "kudzakwa", - "dzakiwe" - ], - "mispellings": [] - } - }, - { - "English": { - "keywords": [ - "overdose", - "overdosed" - ], - "mispellings": [] - }, - "Translation": { - "keywords": [ - "kwecisa", - "ecisile" - ], - "mispellings": [] - } - }, - { - "English": { - "keywords": [ - "LSD", - "acid", - "cheer", - "drop" - ], - "mispellings": [] - }, - "Translation": { - "keywords": [ - "i-LSD", - "i-acid", - "i-cheer", - "i-drop" - ], - "mispellings": [] - } - }, - { - "English": { - "keywords": [ - "magic mushrooms", - "alice" - ], - "mispellings": [ - "shrooms" - ] - }, - "Translation": { - "keywords": [ - "i-magic mushrooms", - "i-alice" - ], - "mispellings": [ - "makhowe" - ] - } - }, - { - "English": { - "keywords": [ - "alcohol", - "booze", - "drinks", - "alcoholic" - ], - 
"mispellings": [ - "hooch", - "alcol" - ] - }, - "Translation": { - "keywords": [ - "tjwala", - "tjwala", - "tinatfo", - "kwetjwala" - ], - "mispellings": [ - "emanti lamponjwana", - "utshwal" - ] - } - }, - { - "English": { - "keywords": [ - "speed", - "billy", - "paste", - "base" - ], - "mispellings": [ - "whizz" - ] - }, - "Translation": { - "keywords": [ - "i-speed", - "i-billy", - "i-pasta", - "i-base" - ], - "mispellings": [ - "sipidi" - ] - } - }, - { - "English": { - "keywords": [ - "poppers", - "liquid gold", - "jungle juice", - "rush" - ], - "mispellings": [ - "buzz" - ] - }, - "Translation": { - "keywords": [ - "i-poppers", - "i-liquid gold", - "i-jungle juice", - "i-rush" - ], - "mispellings": [ - "i-buzz" - ] - } - }, - { - "English": { - "keywords": [ - "weed", - "cannabis", - "Marijuana", - "dope" - ], - "mispellings": [ - "grass", - "pot", - "CBD", - "THC", - "bud", - "mariguana" - ] - }, - "Translation": { - "keywords": [ - "insangu", - "i-cannabis", - "insangu", - "i-dope" - ], - "mispellings": [ - "sihlahla", - "tjani", - "i-CBD", - "i-THC", - "i-bud", - "i-mariguana" - ] - } - }, - { - "English": { - "keywords": [ - "ectasy", - "brownies", - "crystal", - "mandy" - ], - "mispellings": [ - "md", - "mdma", - "molly", - "pills" - ] - }, - "Translation": { - "keywords": [ - "i-ectasy", - "i-brownies", - "i-crystal", - "i-mandy" - ], - "mispellings": [ - "i-md", - "mdma", - "i-molly", - "emaphilisi" - ] - } - }, - { - "English": { - "keywords": [ - "crystal meth", - "ice", - "meth", - "speed" - ], - "mispellings": [ - "crank" - ] - }, - "Translation": { - "keywords": [ - "i-crystal meth", - "i-ice", - "i-meth", - "i-speed" - ], - "mispellings": [ - "i-crank" - ] - } - }, - { - "English": { - "keywords": [ - "ketamine" - ], - "mispellings": [ - "ket" - ] - }, - "Translation": { - "keywords": [ - "i-ketamine", - "-", - "-", - "-" - ], - "mispellings": [ - "khethamine" - ] - } - }, - { - "English": { - "keywords": [ - "Morphine" - ], - "mispellings": [] - }, - "Translation": { - "keywords": [ - "i-Morphine" - ], - "mispellings": [] - } - }, - { - "English": { - "keywords": [ - "Oxycodone", - "Oxy", - "Oxycotton" - ], - "mispellings": [] - }, - "Translation": { - "keywords": [ - "i-Oxycodone", - "i-Oxy", - "i-Oxycotton" - ], - "mispellings": [] - } - }, - { - "English": { - "keywords": [ - "rehabilitation", - "detoxification" - ], - "mispellings": [ - "rehab" - ] - }, - "Translation": { - "keywords": [ - "kubuyesela esimeni", - "kukhipha tidzakamiva" - ], - "mispellings": [ - "kubuyiselwa esimeni" - ] - } - } - ] -} \ No newline at end of file diff --git a/src/parenttext_pipeline/steps.py b/src/parenttext_pipeline/steps.py index aa129b3..f7fe13a 100644 --- a/src/parenttext_pipeline/steps.py +++ b/src/parenttext_pipeline/steps.py @@ -1,31 +1,392 @@ import json import os +import shutil from copy import copy from pathlib import Path +import rpft.converters +from rapidpro_abtesting.main import apply_abtests +from rpft.logger.logger import initialize_main_logger + +from parenttext_pipeline.common import ( + get_full_step_files_dict, + get_full_step_files_list, + get_input_subfolder, + make_output_filepath, + run_node, +) from parenttext_pipeline.extract_keywords import batch -def update_expiration_time(config, source, in_fp): - with open(config.special_expiration, "r") as specifics_json: - specifics = json.load(specifics_json) +def load_flows(config, step_config, step_number, _=None): + step_output_file = make_output_filepath(config, f"_{step_number}.json") + + files = 
get_full_step_files_list(config, step_config)
+    if len(files) != 1:
+        raise NotImplementedError(
+            "load_flows must have exactly one file as input (until we support merging)"
+        )
+
+    shutil.copyfile(files[0], step_output_file)
+    return step_output_file
+
+
+def create_flows(config, step_config, step_number, _=None):
+    step_output_file = make_output_filepath(
+        config, f"_{step_number}_load_from_sheets.json"
+    )
+
+    sheets = get_full_step_files_list(config, step_config)
+
+    initialize_main_logger(Path(config.temppath) / "rpft.log")
+    flows = rpft.converters.create_flows(
+        sheets,
+        None,
+        "json",
+        data_models=step_config.models_module,
+        tags=step_config.tags,
+    )
+    with open(step_output_file, "w") as export:
+        json.dump(flows, export, indent=4)
+
+    return step_output_file
+
+
+def apply_edits(config, step_config, step_number, step_input_file):
+    step_name = step_config.id
+    step_output_file = make_output_filepath(config, f"_{step_number}_{step_name}.json")
+
+    input_sheets = get_full_step_files_list(config, step_config)
+    log_file_path = os.path.join(config.temppath, f"{step_name}.log")
+
+    apply_abtests(
+        step_input_file,
+        step_output_file,
+        input_sheets,
+        sheet_format="json",
+        logfile=log_file_path,
+    )
+
+    return step_output_file
+
+
+def apply_qr_treatment(config, step_config, step_number, step_input_file):
+    step_name = step_config.id
+    step_output_file = make_output_filepath(config, f"_{step_number}_{step_name}.json")
+    # This is redundant, and we should change the JS script to just take the output
+    # filename instead of step_output_basename and config.temppath as arguments
+    step_output_basename = f"{config.flows_outputbasename}_{step_number}_{step_name}"
+
+    files = get_full_step_files_dict(config, step_config)
+    select_phrases_file = files.get("select_phrases_file")
+    special_words_file = files.get("special_words_file")
+    if select_phrases_file is None or special_words_file is None:
+        raise ValueError(
+            "qr_treatment sources must reference a select_phrases_file "
+            "and a special_words_file"
+        )
+
+    # We can do different things to our quick replies depending on the deployment
+    # channel
+    if step_config.qr_treatment == "move":
+        run_node(
+            "idems_translation_chatbot/index.js",
+            "move_quick_replies",
+            step_input_file,
+            select_phrases_file,
+            step_output_basename,
+            config.temppath,
+            step_config.add_selectors,
+            str(step_config.qr_limit),
+            special_words_file,
+        )
+        print(f"Step {step_number} complete, removed quick replies")
+    elif step_config.qr_treatment == "move_and_mod":
+        run_node(
+            "idems_translation_chatbot/index.js",
+            "move_and_mod_quick_replies",
+            step_input_file,
+            select_phrases_file,
+            step_config.replace_phrases,
+            step_output_basename,
+            config.temppath,
+            step_config.add_selectors,
+            str(step_config.qr_limit),
+            special_words_file,
+        )
+        print(f"Step {step_number} complete, removed and modified quick replies")
+    elif step_config.qr_treatment == "reformat":
+        run_node(
+            "idems_translation_chatbot/index.js",
+            "reformat_quick_replies",
+            step_input_file,
+            select_phrases_file,
+            step_output_basename,
+            config.temppath,
+            step_config.count_threshold,
+            step_config.length_threshold,
+            str(step_config.qr_limit),
+            special_words_file,
+        )
+        print(f"Step {step_number} complete, reformatted quick replies")
+    elif step_config.qr_treatment == "reformat_whatsapp":
+        run_node(
+            "idems_translation_chatbot/index.js",
+            "reformat_quick_replies_whatsapp",
+            step_input_file,
+            select_phrases_file,
+            step_output_basename,
+            config.temppath,
+            str(step_config.qr_limit),
+            special_words_file,
+        )
+        print(
+            f"Step {step_number} complete, "
+            "reformatted quick replies to WhatsApp standard"
+        )
+    elif step_config.qr_treatment == "reformat_china":
+        run_node(
+            "idems_translation_chatbot/index.js",
+            "reformat_quick_replies_china",
+            step_input_file,
+            select_phrases_file,
+            step_output_basename,
+            config.temppath,
+            step_config.count_threshold,
+            step_config.length_threshold,
+            str(step_config.qr_limit),
+            special_words_file,
+        )
+        print(
+            f"Step {step_number} complete, "
+            "reformatted quick replies to China standard"
+        )
+    elif step_config.qr_treatment == "wechat":
+        run_node(
+            "idems_translation_chatbot/index.js",
+            "convert_qr_to_html",
+            step_input_file,
+            step_output_basename,
+            config.temppath,
+        )
+        print(f"Step {step_number} complete, moved quick replies to HTML")
+    else:
+        step_output_file = step_input_file
+        print(f"Step {step_number} skipped, no QR edits specified")
+
+    return step_output_file
+
+
+def apply_safeguarding(config, step_config, step_number, step_input_file):
+    step_name = step_config.id
+
+    if len(step_config.sources) != 1:
+        raise ValueError("safeguarding step must have exactly one source")
+    source_name = step_config.sources[0]
+    # source_config = get_source_config(config, source_name, step_name)
+    step_input_path = get_input_subfolder(config, source_name)
+    safeguarding_file_path = step_input_path / "safeguarding_words.json"
+    step_output_file = make_output_filepath(config, f"_{step_number}_{step_name}.json")
+
+    # We may apply both of these operations.
+    if step_config.flow_name and step_config.flow_uuid:
+        run_node(
+            "safeguarding-rapidpro/v2_add_safeguarding_to_flows.js",
+            step_input_file,
+            str(safeguarding_file_path),
+            step_output_file,
+            step_config.flow_uuid,
+            step_config.flow_name,
+        )
+        step_input_file = step_output_file
+        print("Safeguarding flows added")
+
+    if step_config.redirect_flow_names:
+        run_node(
+            "safeguarding-rapidpro/v2_edit_redirect_flow.js",
+            step_input_file,
+            str(safeguarding_file_path),
+            step_output_file,
+            step_config.redirect_flow_names,
+        )
+        print("Redirect flows edited")
+
+    return step_output_file
+
+
+def apply_translations(config, step_config, step_number, step_input_file):
+    step_name = step_config.id
+    step_output_file = make_output_filepath(config, f"_{step_number}_{step_name}.json")
+    # This is redundant, and we should change the JS script to just take the output
+    # filename instead of step_output_basename and config.temppath as arguments
+    step_output_basename = f"{config.flows_outputbasename}_{step_number}_{step_name}"
+
+    merge_translation_jsons(config, step_config)
+
+    for lang in step_config.languages:
+        json_translation_path = os.path.join(
+            config.temppath, step_name, lang["code"], "merged_translations.json"
+        )
+
+        run_node(
+            "idems_translation_chatbot/index.js",
+            "localize",
+            step_input_file,
+            json_translation_path,
+            lang["language"],
+            step_output_basename,
+            config.temppath,
+        )
+
+        step_input_file = step_output_file
+
+    return step_output_file
+
+
+def apply_has_any_word_check(config, step_config, step_number, step_input_file):
+    step_name = step_config.id
+    step_output_file = make_output_filepath(config, f"_{step_number}_{step_name}.json")
+    # This is redundant, and we should change the JS script to just take the output
+    # filename instead of step_output_basename and config.temppath as arguments
+    step_output_basename = f"{config.flows_outputbasename}_{step_number}_{step_name}"
+    # This is inconsistent. Why not specify the output filename?
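+    # Background: "has_any_word" is the RapidPro router test used for keyword
+    # matching. This step runs the checker from idems_translation_chatbot over
+    # the flow definition, presumably to flag router arguments that no longer
+    # match their message text (e.g. after translation); it appears to write
+    # its report under config.temppath using the log name below.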
+    has_any_words_log = f"{step_number}_{step_name}"
+
+    run_node(
+        "idems_translation_chatbot/index.js",
+        "has_any_words_check",
+        step_input_file,
+        config.temppath,
+        step_output_basename,
+        has_any_words_log,
+    )
+
+    return step_output_file
+
+
+def apply_overall_integrity_check(config, step_config, step_number, step_input_file):
+    step_name = step_config.id
+    # This is inconsistent. Why not specify the output filename?
+    integrity_log = f"{step_number}_{step_name}"
+    excel_log_name = os.path.join(config.temppath, f"{step_number}_{step_name}.xlsx")
+
+    run_node(
+        "idems_translation_chatbot/index.js",
+        "overall_integrity_check",
+        step_input_file,
+        config.temppath,
+        integrity_log,
+        excel_log_name,
+    )
+
+    return None
+
+
+def apply_fix_arg_qr_translation(config, step_config, step_number, step_input_file):
+    step_name = step_config.id
+    step_output_file = make_output_filepath(config, f"_{step_number}_{step_name}.json")
+    # This is redundant, and we should change the JS script to just take the output
+    # filename instead of step_output_basename and config.temppath as arguments
+    step_output_basename = f"{config.flows_outputbasename}_{step_number}_{step_name}"
+    # This is inconsistent. Why not specify the output filename?
+    fix_arg_qr_log = f"{step_number}_{step_name}"
+
+    run_node(
+        "idems_translation_chatbot/index.js",
+        "fix_arg_qr_translation",
+        step_input_file,
+        config.temppath,
+        step_output_basename,
+        fix_arg_qr_log,
+    )
+
+    return step_output_file
+
+
+def apply_extract_texts_for_translators(
+    config, step_config, step_number, step_input_file
+):
+    step_name = step_config.id
+    # This is redundant, and we should change the JS script to just take the output
+    # filename instead of step_output_basename and config.temppath as arguments
+    step_translation_basename = (
+        f"{config.flows_outputbasename}_{step_number}_{step_name}"
+    )
+    step_translation_file = os.path.join(
+        config.temppath, step_translation_basename + ".json"
+    )
+
+    # Setup output file to send to translators if it doesn't exist
+    translator_output_folder = os.path.join(config.outputpath, "send_to_translators")
+    if not os.path.exists(translator_output_folder):
+        os.makedirs(translator_output_folder)
+    translation_output_file = os.path.join(
+        translator_output_folder, f"{config.flows_outputbasename}_crowdin.pot"
+    )
+
+    # Produce translatable strings in json format
+    run_node(
+        "idems_translation_chatbot/index.js",
+        "extract_simple",
+        step_input_file,
+        config.temppath,
+        step_translation_basename,
+    )
+
+    # Convert to pot
+    run_node(
+        "idems_translation_common/index.js",
+        "convert",
+        step_translation_file,
+        translation_output_file,
+    )
+
+    return None
+
+
+def merge_translation_jsons(config, step_config):
+    step_name = step_config.id
+    if len(step_config.sources) != 1:
+        raise ValueError("translation step must have exactly one source")
+    source_name = step_config.sources[0]
+
+    for lang in step_config.languages:
+        translations_input_folder = Path(config.inputpath) / source_name / lang["code"]
+        translations_temp_folder = Path(config.temppath) / step_name / lang["code"]
+        os.makedirs(translations_temp_folder, exist_ok=True)
+
+        # Merge all translation files into a single JSON that we can localise back into
+        # our flows
+        run_node(
+            "idems_translation_common/index.js",
+            "concatenate_json",
+            translations_input_folder,
+            translations_temp_folder,
+            "merged_translations.json",
+        )
-    with open(in_fp, "r") as in_json:
+
+def update_expiration_times(config, step_config, step_number, step_input_file):
+
step_name = step_config.id + step_output_file = make_output_filepath(config, f"_{step_number}_{step_name}.json") + + if not step_config.sources: + specifics = {} + else: + files = get_full_step_files_dict(config, step_config) + special_expiration_filepath = files.get("special_expiration_file") + if special_expiration_filepath is None: + raise ValueError( + "update_expiration_times sources must reference " + "a special_expiration_file" + ) + with open(special_expiration_filepath, "r") as specifics_json: + specifics = json.load(specifics_json) + + with open(step_input_file, "r") as in_json: org = json.load(in_json) for flow in org.get("flows", []): - set_expiration(flow, config.default_expiration, specifics) + set_expiration(flow, step_config.default_expiration_time, specifics) - out_fp = os.path.join( - config.outputpath, - source["filename"] + "_1_2_modified_expiration_times.json", - ) - with open(out_fp, "w") as out_json: + with open(step_output_file, "w") as out_json: json.dump(org, out_json) - print("Expiration times modified") - - return out_fp + return step_output_file def set_expiration(flow, default, specifics={}): @@ -39,14 +400,17 @@ def set_expiration(flow, default, specifics={}): return flow -def split_rapidpro_json(config, source, in_fp): - n = source.get("split_no", 1) - - if n < 2: - print(f"File splitting skipped, batch_count={n}") +def split_rapidpro_json(config, input_filename): + n = config.output_split_number + assert isinstance(n, int) and n >= 1 + if n == 1: + output_filename = ( + Path(config.outputpath) / f"{config.flows_outputbasename}.json" + ) + shutil.copyfile(input_filename, output_filename) return - with open(in_fp, 'r', encoding='utf-8') as in_json: + with open(input_filename, "r", encoding="utf-8") as in_json: org = json.load(in_json) flows_per_file = len(org["flows"]) // n @@ -68,12 +432,19 @@ def split_rapidpro_json(config, source, in_fp): ], } ) - in_path = Path(in_fp) - out_fp = in_path.with_stem(in_path.stem + "_" + str(i)) + output_filename = ( + Path(config.outputpath) / f"{config.flows_outputbasename}_{i}.json" + ) - with open(out_fp, "w") as out_file: + with open(output_filename, "w") as out_file: json.dump(org_new, out_file, indent=2) - print(f"File written, path={out_fp}") + print(f"File written, path={output_filename}") + + +def write_diffable(config, input_filename, subfolder="diffable"): + output_subfolder = Path(config.outputpath) / subfolder + os.makedirs(output_subfolder, exist_ok=True) + rpft.converters.flows_to_sheets(input_filename, output_subfolder, strip_uuids=True) def edit_campaign(campaign, flows):