From d3b9a169691ca3df2e45051e4c60374339150930 Mon Sep 17 00:00:00 2001 From: "pixeebot[bot]" <104101892+pixeebot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 16:23:08 -0400 Subject: [PATCH 1/3] Add timeout to `requests` calls (#5) Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com> --- backend/oasst_backend/utils/discord.py | 2 +- data/datasets/biostars_qa/get_biostars_dataset.py | 2 +- data/datasets/oa_dolly_15k/create_dataset.py | 2 +- data/datasets/oa_stackexchange/download.py | 4 ++-- data/datasets/youtube_subs_howto100M/prepare.py | 2 +- data/datasets/zhihu-kol/main.py | 6 +++--- data/datasets/zhihu-kol/scrape_by_topic.py | 2 +- inference/worker/chat_chain_utils.py | 4 ++-- inference/worker/openapi_parser.py | 4 ++-- inference/worker/utils.py | 4 ++-- model/model_training/custom_datasets/prompt_dialogue.py | 2 +- model/model_training/custom_datasets/qa_datasets.py | 4 ++-- scripts/data_augment/data_augment.py | 4 ++-- text-frontend/__main__.py | 6 +++--- text-frontend/auto_main.py | 6 +++--- 15 files changed, 27 insertions(+), 27 deletions(-) diff --git a/backend/oasst_backend/utils/discord.py b/backend/oasst_backend/utils/discord.py index ccadcc077f..b8be607dc5 100644 --- a/backend/oasst_backend/utils/discord.py +++ b/backend/oasst_backend/utils/discord.py @@ -59,7 +59,7 @@ def send_new_report_message(message_details: dict, label_text: str, user_id: UUI "content": f"New flagged message https://open-assistant.io/admin/messages/{message_details['message_id']}", "embeds": [message_content_embed, label_text_embed], }, - ) + timeout=60) res.raise_for_status() except Exception as e: logger.exception(f"Failed to send flagged message. error: {e}") diff --git a/data/datasets/biostars_qa/get_biostars_dataset.py b/data/datasets/biostars_qa/get_biostars_dataset.py index 135626048f..17ae220343 100644 --- a/data/datasets/biostars_qa/get_biostars_dataset.py +++ b/data/datasets/biostars_qa/get_biostars_dataset.py @@ -41,7 +41,7 @@ def get_biostars_dataset(start_idx=9557161, accept_threshold=1000000, sleep=0.1, print(f"MSG: {file} exists. Skipping; Current accepted: {has_accepted_count}") continue - r = requests.get(url, headers=headers) + r = requests.get(url, headers=headers, timeout=60) # print(r.status_code, r.reason) diff --git a/data/datasets/oa_dolly_15k/create_dataset.py b/data/datasets/oa_dolly_15k/create_dataset.py index 8e07571a70..e1f1f75581 100644 --- a/data/datasets/oa_dolly_15k/create_dataset.py +++ b/data/datasets/oa_dolly_15k/create_dataset.py @@ -9,7 +9,7 @@ def download_data(url: str, destination: str): - response = requests.get(url, stream=True) + response = requests.get(url, stream=True, timeout=60) with open(destination, "wb") as handle: for data in response.iter_content(): diff --git a/data/datasets/oa_stackexchange/download.py b/data/datasets/oa_stackexchange/download.py index e477061f58..b90c2579f2 100755 --- a/data/datasets/oa_stackexchange/download.py +++ b/data/datasets/oa_stackexchange/download.py @@ -27,7 +27,7 @@ def get_all_filenames(): This needs quite some mangling because of special cases (i.e. stackoverflow is not in one 7z archive). Ignore meta files. 
""" - response = requests.get("https://archive.org/download/stackexchange") + response = requests.get("https://archive.org/download/stackexchange", timeout=60) if response.ok: soup = bs(response.content, "html.parser") table = soup.find("table") @@ -50,7 +50,7 @@ def download_url(dataset_name: str, url: str): return cache_path else: print("Downloading xml: ", dataset_name) - response = requests.get(url) + response = requests.get(url, timeout=60) print("Finished downloading: ", dataset_name) with open(cache_path, "wb") as f: f.write(response.content) diff --git a/data/datasets/youtube_subs_howto100M/prepare.py b/data/datasets/youtube_subs_howto100M/prepare.py index 738beef78d..78faee1a36 100644 --- a/data/datasets/youtube_subs_howto100M/prepare.py +++ b/data/datasets/youtube_subs_howto100M/prepare.py @@ -74,7 +74,7 @@ def main(output_dir: str = "data"): print("Downloading HowTo100M raw_caption.zip...") print(" might take some time(3.4G)...") url = "https://www.rocq.inria.fr/cluster-willow/amiech/howto100m/raw_caption.zip" - response = requests.get(url) + response = requests.get(url, timeout=60) zipped = zipfile.ZipFile(io.BytesIO(response.content)) zipped.extractall("./temp") diff --git a/data/datasets/zhihu-kol/main.py b/data/datasets/zhihu-kol/main.py index b508fe1586..d554320124 100644 --- a/data/datasets/zhihu-kol/main.py +++ b/data/datasets/zhihu-kol/main.py @@ -41,7 +41,7 @@ def get_uid_by_url_token(url_token: str) -> str: } url = "https://api.zhihu.com/people/" + url_token - response = requests.get(url, headers=headers) + response = requests.get(url, headers=headers, timeout=60) uid = response.json()["id"] return uid @@ -100,7 +100,7 @@ def get_user_answers(url_token: str, max_count: int = 100000) -> pd.DataFrame: ("offset", f"{offset}"), ) - response = requests.get(url, headers=headers, params=params) + response = requests.get(url, headers=headers, params=params, timeout=60) if response.json().get("paging") is None: return pd.DataFrame(columns=operations.keys()) @@ -148,7 +148,7 @@ def get_answer_content(qid: str, aid) -> str: "Host": "www.zhihu.com", } url = f"https://www.zhihu.com/question/{qid}/answer/{aid}" - response = requests.get(url, headers=headers) + response = requests.get(url, headers=headers, timeout=60) soup = BeautifulSoup(response.text, "html.parser") content = " ".join([p.text.strip() for p in soup.find_all("p")]) diff --git a/data/datasets/zhihu-kol/scrape_by_topic.py b/data/datasets/zhihu-kol/scrape_by_topic.py index 428ab0d5b8..dd8dc685d4 100644 --- a/data/datasets/zhihu-kol/scrape_by_topic.py +++ b/data/datasets/zhihu-kol/scrape_by_topic.py @@ -46,7 +46,7 @@ def get_answer_content(qid: int, aid: int, question_str: str) -> str: "Host": "www.zhihu.com", } url = f"https://www.zhihu.com/question/{qid}/answer/{aid}" - response = requests.get(url, headers=headers) + response = requests.get(url, headers=headers, timeout=60) soup = BeautifulSoup(response.text, "html.parser") content = " ".join([p.text.strip() for p in soup.find_all("p")]) diff --git a/inference/worker/chat_chain_utils.py b/inference/worker/chat_chain_utils.py index 9c20d3b398..602c925333 100644 --- a/inference/worker/chat_chain_utils.py +++ b/inference/worker/chat_chain_utils.py @@ -205,14 +205,14 @@ def run_request(self, params: str, url: str, param_location: str, type: str, pay logger.info( f"Running {type.upper()} request on {url} with\nparams: {params}\nparam_location: {param_location}\npayload: {payload}" ) - res = requests.get(url, params=query_params, headers=headers) + res = requests.get(url, 
params=query_params, headers=headers, timeout=60) elif type.lower() == "post": # if model did not generate payload object, use params as payload data = json.dumps(payload) if payload else json.dumps(params) logger.info( f"Running {type.upper()} request on {url} with\nparams: {params}\nparam_location: {param_location}\npayload: {data}" ) - res = requests.post(url, params=query_params, data=data, headers=headers) + res = requests.post(url, params=query_params, data=data, headers=headers, timeout=60) else: return f"ERROR! Unsupported request type: {type}. Only GET and POST are supported. Try again!" diff --git a/inference/worker/openapi_parser.py b/inference/worker/openapi_parser.py index 56dc34d6a4..26a9f51eb4 100644 --- a/inference/worker/openapi_parser.py +++ b/inference/worker/openapi_parser.py @@ -8,7 +8,7 @@ def fetch_openapi_spec(url): - response = requests.get(url) + response = requests.get(url, timeout=60) if response.status_code != 200: raise Exception(f"Failed to fetch data from URL: {url}. Status code: {response.status_code}") @@ -29,7 +29,7 @@ def fetch_openapi_spec(url): def get_plugin_config(url: str) -> inference.PluginConfig | None: try: - response = requests.get(url) + response = requests.get(url, timeout=60) response.raise_for_status() plugin_dict = response.json() logger.info(f"Plugin config downloaded {plugin_dict}") diff --git a/inference/worker/utils.py b/inference/worker/utils.py index d5ae793f5c..8a41638d2c 100644 --- a/inference/worker/utils.py +++ b/inference/worker/utils.py @@ -258,11 +258,11 @@ def _maybe_add_bearer_token(self, headers: dict[str, str] | None): def get(self, path: str, **kwargs): kwargs["headers"] = self._maybe_add_bearer_token(kwargs.get("headers")) - return requests.get(self.base_url + path, auth=self.auth, **kwargs) + return requests.get(self.base_url + path, auth=self.auth, **kwargs, timeout=60) def post(self, path: str, **kwargs): kwargs["headers"] = self._maybe_add_bearer_token(kwargs.get("headers")) - return requests.post(self.base_url + path, auth=self.auth, **kwargs) + return requests.post(self.base_url + path, auth=self.auth, **kwargs, timeout=60) def get_inference_server_stream_events( diff --git a/model/model_training/custom_datasets/prompt_dialogue.py b/model/model_training/custom_datasets/prompt_dialogue.py index 1d30458cb3..b3f41f8376 100644 --- a/model/model_training/custom_datasets/prompt_dialogue.py +++ b/model/model_training/custom_datasets/prompt_dialogue.py @@ -34,7 +34,7 @@ def load_oig_file( # download file if not cached if not local_path.exists() or local_path.stat().st_size == 0 or no_cache: print(f"downloading {source_url} to {local_path}") - r = requests.get(source_url, stream=True) + r = requests.get(source_url, stream=True, timeout=60) with local_path.open(mode="wb") as fd: for chunk in r.iter_content(chunk_size=1024 * 1024): fd.write(chunk) diff --git a/model/model_training/custom_datasets/qa_datasets.py b/model/model_training/custom_datasets/qa_datasets.py index f1be70910f..d7c69cc072 100644 --- a/model/model_training/custom_datasets/qa_datasets.py +++ b/model/model_training/custom_datasets/qa_datasets.py @@ -693,8 +693,8 @@ def __init__(self, cache_dir: str | Path, mode: str = "sft") -> None: data = json.load(f) else: req = requests.get( - "https://raw.githubusercontent.com/teknium1/GPTeacher/main/Roleplay/roleplay-simple-deduped-roleplay-instruct.json" - ) + "https://raw.githubusercontent.com/teknium1/GPTeacher/main/Roleplay/roleplay-simple-deduped-roleplay-instruct.json", + timeout=60) data = json.loads(req.text) 
os.makedirs(saved_path, exist_ok=True) with open(saved_path / file_name, "w+") as f: diff --git a/scripts/data_augment/data_augment.py b/scripts/data_augment/data_augment.py index 79072006ce..d5c89e3caa 100644 --- a/scripts/data_augment/data_augment.py +++ b/scripts/data_augment/data_augment.py @@ -144,7 +144,7 @@ def __init__(self, base_url=None, filter_opts=None): ) def get_all_filenames(self): - response = requests.get("https://archive.org/download/stackexchange") + response = requests.get("https://archive.org/download/stackexchange", timeout=60) if response.ok: soup = bs(response.content, "html.parser") table = soup.find("table") @@ -245,7 +245,7 @@ def parse(self, _): xml_posts_path = urls.get(dataset_name) - response = requests.get(xml_posts_path) + response = requests.get(xml_posts_path, timeout=60) df = self.xml_to_df(response) df = self.filter(df) diff --git a/text-frontend/__main__.py b/text-frontend/__main__.py index b3f4d925d0..3e90910be3 100644 --- a/text-frontend/__main__.py +++ b/text-frontend/__main__.py @@ -32,14 +32,14 @@ def main(backend_url: str = "http://127.0.0.1:8080", api_key: str = "1234"): create_user_request = dict(USER) create_user_request["tos_acceptance"] = True response = requests.post( - f"{backend_url}/api/v1/frontend_users/", json=create_user_request, headers={"X-API-Key": api_key} - ) + f"{backend_url}/api/v1/frontend_users/", json=create_user_request, headers={"X-API-Key": api_key}, + timeout=60) response.raise_for_status() user = response.json() typer.echo(f"user: {user}") def _post(path: str, json: dict) -> dict: - response = requests.post(f"{backend_url}{path}", json=json, headers={"X-API-Key": api_key}) + response = requests.post(f"{backend_url}{path}", json=json, headers={"X-API-Key": api_key}, timeout=60) response.raise_for_status() if response.status_code == http.HTTPStatus.NO_CONTENT: return None diff --git a/text-frontend/auto_main.py b/text-frontend/auto_main.py index fc2cb5eb32..101e86c0a5 100644 --- a/text-frontend/auto_main.py +++ b/text-frontend/auto_main.py @@ -30,7 +30,7 @@ def main( """automates tasks""" def _post(path: str, json: dict) -> dict: - response = requests.post(f"{backend_url}{path}", json=json, headers={"X-API-Key": api_key}) + response = requests.post(f"{backend_url}{path}", json=json, headers={"X-API-Key": api_key}, timeout=60) response.raise_for_status() if response.status_code == http.HTTPStatus.NO_CONTENT: return None @@ -58,8 +58,8 @@ def gen_random_ranking(messages): # make sure dummy user has accepted the terms of service create_user_request["tos_acceptance"] = True response = requests.post( - f"{backend_url}/api/v1/frontend_users/", json=create_user_request, headers={"X-API-Key": api_key} - ) + f"{backend_url}/api/v1/frontend_users/", json=create_user_request, headers={"X-API-Key": api_key}, + timeout=60) response.raise_for_status() user = response.json() typer.echo(f"user: {user}") From 0d7e3eed189e864442aad4ea40ac7c95a403f155 Mon Sep 17 00:00:00 2001 From: "pixeebot[bot]" <104101892+pixeebot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 16:54:10 -0400 Subject: [PATCH 2/3] Secure Source of Randomness (#6) Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com> --- backend/oasst_backend/prompt_repository.py | 6 ++-- backend/oasst_backend/tree_manager.py | 30 +++++++++---------- data/datasets/TSSB-3M/generate_dataset.py | 6 ++-- .../logicreference_OA/generate_dataset.py | 4 +-- data/datasets/mt_note_generation/prepare.py | 4 +-- data/datasets/poetry_instruction/prepare.py | 14 ++++----- 
.../reasoning_gsm_qna_oa/data_process.py | 8 ++--- .../semantics_ws_qna_oa/data_process.py | 6 ++-- .../soda_synthetic_dialogue/prepare.py | 22 +++++++------- .../tatoeba_mt_qna_oa/data_process.py | 8 ++--- .../youtube_subs_howto100M/prepare.py | 4 +-- inference/tests/locust/locustfile.py | 8 ++--- inference/worker/utils.py | 4 +-- .../model_eval/manual/create_synth_import.py | 6 ++-- model/model_eval/manual/sampling_report.py | 4 +-- model/model_eval/manual/subsample_dataset.py | 6 ++-- .../custom_datasets/dialogue_collator.py | 8 ++--- .../custom_datasets/formatting.py | 6 ++-- .../custom_datasets/instruction.py | 4 +-- .../custom_datasets/qa_datasets.py | 6 ++-- .../custom_datasets/rank_datasets.py | 4 +-- .../custom_datasets/summarization.py | 10 +++---- .../custom_datasets/toxic_conversation.py | 4 +-- .../custom_datasets/translation.py | 26 ++++++++-------- model/model_training/tools/sample_rm_data.py | 4 +-- model/model_training/trainer_rl.py | 4 +-- model/model_training/utils/utils.py | 12 ++++---- model/pretokenizer/pretokenize.py | 4 +-- oasst-data/examples/split_dataset.py | 4 +-- .../oasst_shared/schemas/inference.py | 4 +-- scripts/data_augment/data_augment.py | 14 ++++----- text-frontend/__main__.py | 4 +-- text-frontend/auto_main.py | 22 +++++++------- 33 files changed, 140 insertions(+), 140 deletions(-) diff --git a/backend/oasst_backend/prompt_repository.py b/backend/oasst_backend/prompt_repository.py index a16e2cf373..d55326580c 100644 --- a/backend/oasst_backend/prompt_repository.py +++ b/backend/oasst_backend/prompt_repository.py @@ -1,4 +1,3 @@ -import random import re from collections import defaultdict from datetime import datetime, timedelta @@ -39,6 +38,7 @@ from sqlalchemy.orm import Query from sqlalchemy.orm.attributes import flag_modified from sqlmodel import JSON, Session, and_, func, literal_column, not_, or_, text, update +import secrets _task_type_and_reaction = ( ( @@ -688,9 +688,9 @@ def fetch_random_conversation( if last_message_role: conv_messages = [m for m in messages_tree if m.role == last_message_role] - conv_messages = [random.choice(conv_messages)] + conv_messages = [secrets.choice(conv_messages)] else: - conv_messages = [random.choice(messages_tree)] + conv_messages = [secrets.choice(messages_tree)] messages_tree = {m.id: m for m in messages_tree} while True: diff --git a/backend/oasst_backend/tree_manager.py b/backend/oasst_backend/tree_manager.py index 45fcc69555..7833f56898 100644 --- a/backend/oasst_backend/tree_manager.py +++ b/backend/oasst_backend/tree_manager.py @@ -1,4 +1,3 @@ -import random from datetime import datetime, timedelta from enum import Enum from http import HTTPStatus @@ -37,6 +36,7 @@ from oasst_shared.utils import utcnow from sqlalchemy.sql.functions import coalesce from sqlmodel import Session, and_, func, not_, or_, text, update +import secrets class TaskType(Enum): @@ -302,7 +302,7 @@ def activate_one(db: Session) -> int: weights = [data["reply_ranked_1"] + 1 for data in author_data] # first select an author - prompt_author_id: UUID = random.choices(author_ids, weights=weights)[0] + prompt_author_id: UUID = secrets.SystemRandom().choices(author_ids, weights=weights)[0] logger.info(f"Selected random prompt author {prompt_author_id} among {len(author_data)} candidates.") # select random prompt of author @@ -325,7 +325,7 @@ def activate_one(db: Session) -> int: logger.warning("No prompt candidates of selected author found.") return False - winner_prompt = random.choice(prompt_candidates) + winner_prompt = 
secrets.choice(prompt_candidates) message: Message = winner_prompt.Message logger.info(f"Prompt lottery winner: {message.id=}") @@ -514,7 +514,7 @@ def next_task( incomplete_rankings = list(filter(lambda m: m.role == "assistant", incomplete_rankings)) if len(incomplete_rankings) > 0: - ranking_parent_id = random.choice(incomplete_rankings).parent_id + ranking_parent_id = secrets.choice(incomplete_rankings).parent_id messages = self.pr.fetch_message_conversation(ranking_parent_id) assert len(messages) > 0 and messages[-1].id == ranking_parent_id @@ -524,7 +524,7 @@ def next_task( replies = self.pr.fetch_message_children(ranking_parent_id, review_result=True, deleted=False) assert len(replies) > 1 - random.shuffle(replies) # hand out replies in random order + secrets.SystemRandom().shuffle(replies) # hand out replies in random order reply_messages = prepare_conversation_message_list(replies) if any(not m.synthetic for m in reply_messages): reveal_synthetic = False @@ -565,7 +565,7 @@ def next_task( replies_need_review = list(filter(lambda m: m.role == "assistant", replies_need_review)) if len(replies_need_review) > 0: - random_reply_message = random.choice(replies_need_review) + random_reply_message = secrets.choice(replies_need_review) messages = self.pr.fetch_message_conversation(random_reply_message) conversation = prepare_conversation(messages) @@ -580,7 +580,7 @@ def next_task( valid_labels = self.cfg.labels_assistant_reply if ( desired_task_type == protocol_schema.TaskRequestType.random - and random.random() > self.cfg.p_full_labeling_review_reply_assistant + and secrets.SystemRandom().random() > self.cfg.p_full_labeling_review_reply_assistant ): label_mode = protocol_schema.LabelTaskMode.simple label_disposition = protocol_schema.LabelTaskDisposition.spam @@ -605,7 +605,7 @@ def next_task( valid_labels = self.cfg.labels_prompter_reply if ( desired_task_type == protocol_schema.TaskRequestType.random - and random.random() > self.cfg.p_full_labeling_review_reply_prompter + and secrets.SystemRandom().random() > self.cfg.p_full_labeling_review_reply_prompter ): label_mode = protocol_schema.LabelTaskMode.simple label_disposition = protocol_schema.LabelTaskDisposition.spam @@ -647,11 +647,11 @@ def next_task( if 0 < p.active_children_count < self.cfg.lonely_children_count and p.parent_role == "prompter" ] - if len(lonely_children_parents) > 0 and random.random() < self.cfg.p_lonely_child_extension: - random_parent = random.choice(lonely_children_parents) + if len(lonely_children_parents) > 0 and secrets.SystemRandom().random() < self.cfg.p_lonely_child_extension: + random_parent = secrets.choice(lonely_children_parents) if random_parent is None: - random_parent = random.choice(extendible_parents) + random_parent = secrets.choice(extendible_parents) # fetch random conversation to extend logger.debug(f"selected {random_parent=}") @@ -672,14 +672,14 @@ def next_task( case TaskType.LABEL_PROMPT: assert len(prompts_need_review) > 0 - message = random.choice(prompts_need_review) + message = secrets.choice(prompts_need_review) message = self.pr.fetch_message(message.id) # re-fetch message including emojis label_mode = protocol_schema.LabelTaskMode.full label_disposition = protocol_schema.LabelTaskDisposition.quality valid_labels = self.cfg.labels_initial_prompt - if random.random() > self.cfg.p_full_labeling_review_prompt: + if secrets.SystemRandom().random() > self.cfg.p_full_labeling_review_prompt: valid_labels = self.cfg.mandatory_labels_initial_prompt.copy() label_mode = 
protocol_schema.LabelTaskMode.simple label_disposition = protocol_schema.LabelTaskDisposition.spam @@ -839,7 +839,7 @@ def _enter_state(self, mts: MessageTreeState, state: message_tree_state.State): logger.info(f"Tree entered terminal '{mts.state}' state ({mts.message_tree_id=})") root_msg = self.pr.fetch_message(message_id=mts.message_tree_id, fail_if_missing=False) if root_msg and was_active: - if random.random() < self.cfg.p_activate_backlog_tree: + if secrets.SystemRandom().random() < self.cfg.p_activate_backlog_tree: self.activate_backlog_tree(lang=root_msg.lang) if self.cfg.min_active_rankings_per_lang > 0: @@ -1509,7 +1509,7 @@ def _insert_default_state( ) -> MessageTreeState: if goal_tree_size is None: if self.cfg.random_goal_tree_size and self.cfg.min_goal_tree_size < self.cfg.goal_tree_size: - goal_tree_size = random.randint(self.cfg.min_goal_tree_size, self.cfg.goal_tree_size) + goal_tree_size = secrets.SystemRandom().randint(self.cfg.min_goal_tree_size, self.cfg.goal_tree_size) else: goal_tree_size = self.cfg.goal_tree_size return self._insert_tree_state( diff --git a/data/datasets/TSSB-3M/generate_dataset.py b/data/datasets/TSSB-3M/generate_dataset.py index ec785e83bd..1b86944c7e 100644 --- a/data/datasets/TSSB-3M/generate_dataset.py +++ b/data/datasets/TSSB-3M/generate_dataset.py @@ -2,11 +2,11 @@ """ import json -import random import re from os.path import join from tqdm import tqdm +import secrets INSTRUCTIONS_LIST = [ "Find the bug in the following code:", @@ -48,12 +48,12 @@ def gen_instruction(): - idx = random.randint(0, len(INSTRUCTIONS_LIST) - 1) + idx = secrets.SystemRandom().randint(0, len(INSTRUCTIONS_LIST) - 1) return INSTRUCTIONS_LIST[idx] def gen_response_prefix(): - idx = random.randint(0, len(RESPONSE_PREFIX_WORDS) - 1) + idx = secrets.SystemRandom().randint(0, len(RESPONSE_PREFIX_WORDS) - 1) return RESPONSE_PREFIX_WORDS[idx] diff --git a/data/datasets/logicreference_OA/generate_dataset.py b/data/datasets/logicreference_OA/generate_dataset.py index 59ff45ff39..25e229050b 100644 --- a/data/datasets/logicreference_OA/generate_dataset.py +++ b/data/datasets/logicreference_OA/generate_dataset.py @@ -17,12 +17,12 @@ import os -import random import rules import splits import tensorflow as tf from absl import app +import secrets # Generation parameters: # TARGET_FOLDER = "/path/to/generate/dataset/" @@ -74,7 +74,7 @@ def main(_): # Generate each of the splits: print("IID:") - random.seed(RANDOM_SEED) + secrets.SystemRandom().seed(RANDOM_SEED) (train_examples, test_examples) = splits.generate_training_and_test_sets_iid( N_INFERENCE_PROBLEMS, N_VARIATIONS, diff --git a/data/datasets/mt_note_generation/prepare.py b/data/datasets/mt_note_generation/prepare.py index ff5fcdc5a1..cf62c30507 100644 --- a/data/datasets/mt_note_generation/prepare.py +++ b/data/datasets/mt_note_generation/prepare.py @@ -1,13 +1,13 @@ import json import math import os -import random import re import sys from string import punctuation import kaggle import pandas as pd +import secrets CLINICAL_NOTE_GENERATION_TEMPLATE = """User: Write a clinical note about a patient with the following {section}: {section_information}. 
Rosey: {note}""" @@ -65,7 +65,7 @@ def main(output_dir: str = "data"): kaggle.api.dataset_download_files("tboyle10/medicaltranscriptions", "data", unzip=True) mt_samples = preprocess(pd.read_csv("data/mtsamples.csv")) conversations = get_conversations(mt_samples) - random.shuffle(conversations) + secrets.SystemRandom().shuffle(conversations) train_limit = math.ceil(len(conversations) * 0.6) dev_limit = math.ceil(len(conversations) * 0.8) train, validation, test = ( diff --git a/data/datasets/poetry_instruction/prepare.py b/data/datasets/poetry_instruction/prepare.py index 9a4718e2da..502d20154f 100644 --- a/data/datasets/poetry_instruction/prepare.py +++ b/data/datasets/poetry_instruction/prepare.py @@ -1,9 +1,9 @@ import json import os -import random import kaggle import pandas as pd +import secrets # Authenticate the Kaggle API client kaggle.api.authenticate() @@ -116,15 +116,15 @@ author = row["Poet"] # Variables to store to instruction, reply, source, and metadata. - instruction = random.choice(writing_prompts_topic).replace("$topic", str(topics)) - reply = random.choice(replies_topic).replace("$topic", str(topics)).replace("$title", title).replace("$poem", poem) + instruction = secrets.choice(writing_prompts_topic).replace("$topic", str(topics)) + reply = secrets.choice(replies_topic).replace("$topic", str(topics)).replace("$title", title).replace("$poem", poem) source = "PoetryFoundation.org" + " - " + author metadata = {"author": author, "title": title, "tags": str(topics), "task_type": "writing"} # If the entry has an empty value for the topic, use the non-topic prompts and replies. if pd.isna(topics): - instruction = random.choice(writing_prompts_notTopic) - reply = random.choice(replies_notTopic).replace("$title", title).replace("$poem", poem) + instruction = secrets.choice(writing_prompts_notTopic) + reply = secrets.choice(replies_notTopic).replace("$title", title).replace("$poem", poem) # Create a dictionary entry for the entry and append it to the list. entry = {"INSTRUCTION": instruction, "RESPONSE": reply, "SOURCE": source, "METADATA": json.dumps(metadata)} @@ -139,8 +139,8 @@ author = row["Poet"] # Variables to store to instruction, reply, source, and metadata. 
- instruction = random.choice(titling_prompts).replace("$poem", poem) - reply = random.choice(titling_replies).replace("$title", title) + instruction = secrets.choice(titling_prompts).replace("$poem", poem) + reply = secrets.choice(titling_replies).replace("$title", title) source = "PoetryFoundation.org" + " - " + author metadata = {"author": author, "title": title, "tags": str(topics), "task_type": "titling"} diff --git a/data/datasets/reasoning_gsm_qna_oa/data_process.py b/data/datasets/reasoning_gsm_qna_oa/data_process.py index 0aa4c99471..948e2b7682 100644 --- a/data/datasets/reasoning_gsm_qna_oa/data_process.py +++ b/data/datasets/reasoning_gsm_qna_oa/data_process.py @@ -1,12 +1,12 @@ import json -import random import re from dataclasses import dataclass import pandas as pd from datasets import load_dataset +import secrets -random.seed(42) +secrets.SystemRandom().seed(42) random_list_python = [ "Make a python code.", @@ -29,9 +29,9 @@ def qna_wrapper(source, random_list_python, random_list_answer): def create_qna(row): - instruction = row["question"] if source == "gsm8k" else row["input"] + " " + random.choice(random_list_python) + instruction = row["question"] if source == "gsm8k" else row["input"] + " " + secrets.choice(random_list_python) response = ( - re.sub(r"(<<[\d\.\-\+\*=/\\]+>>)", "", row["answer"].replace("####", random.choice(random_list_answer))) + re.sub(r"(<<[\d\.\-\+\*=/\\]+>>)", "", row["answer"].replace("####", secrets.choice(random_list_answer))) + "." if source == "gsm8k" else row["code"] diff --git a/data/datasets/semantics_ws_qna_oa/data_process.py b/data/datasets/semantics_ws_qna_oa/data_process.py index ec70bb2d7e..49b584cc61 100644 --- a/data/datasets/semantics_ws_qna_oa/data_process.py +++ b/data/datasets/semantics_ws_qna_oa/data_process.py @@ -1,19 +1,19 @@ import json -import random from dataclasses import dataclass import pandas as pd import random_stuff from datasets import load_dataset +import secrets -random.seed(42) +secrets.SystemRandom().seed(42) # format to QnA def qna_wrapper(): def create_qna(row): # make a random number - random_num = random.randint(0, 2) + random_num = secrets.SystemRandom().randint(0, 2) # extract rows' vals lang = row["Language"] diff --git a/data/datasets/soda_synthetic_dialogue/prepare.py b/data/datasets/soda_synthetic_dialogue/prepare.py index dceb6ea1d1..70d5341ec7 100644 --- a/data/datasets/soda_synthetic_dialogue/prepare.py +++ b/data/datasets/soda_synthetic_dialogue/prepare.py @@ -2,11 +2,11 @@ import json import os -import random import sys from datasets import load_dataset from tqdm import tqdm +import secrets # adapted from https://colab.research.google.com/drive/1Sw3px5dP8whdqT7QMNoqwmqIasZkMbJi?usp=sharing @@ -77,7 +77,7 @@ def main(output_dir: str = "data"): """Download and prepare the dataset for use.""" - random.seed(42) + secrets.SystemRandom().seed(42) dataset = load_dataset("allenai/soda") os.makedirs(output_dir, exist_ok=True) @@ -103,7 +103,7 @@ def main(output_dir: str = "data"): dialogue = [s2 + ": " + s1 for s1, s2 in zip(dat["dialogue"], dat["speakers"])] - if random.randint(0, 6) == 0: + if secrets.SystemRandom().randint(0, 6) == 0: # print("##") # print(f"User: Can you give me a short story description for this dialog?") # print(" " + "\n ".join(dialog)) @@ -116,7 +116,7 @@ def main(output_dir: str = "data"): conversation = SUMMARY_TEMPLATE.format(dialogue="\n ".join(dialogue), story=story, title=title) if theme: conversation = conversation + THEME_TEMPLATE.format(theme=theme) - elif random.randint(0, 
6) == 0: + elif secrets.SystemRandom().randint(0, 6) == 0: # print("##") # print(f"User: Can you write a short dialog based on this story:\n {story}") # print(f"Assistant: Sure, a dialog for this story could be:") @@ -131,7 +131,7 @@ def main(output_dir: str = "data"): ) if theme: conversation = conversation + THEME_TEMPLATE.format(theme=theme) - elif random.randint(0, 3) == 0: + elif secrets.SystemRandom().randint(0, 3) == 0: # print("##") # print(f"User: Can you write the next few lines of dialog for this scene:") # if random.randint(0, 1) == 0: @@ -153,9 +153,9 @@ def main(output_dir: str = "data"): # if theme: # print("User: What would be one theme of this story?") # print(f'Assistant: One theme of this story could be: "{theme}"') - if random.randint(0, 1) == 0: + if secrets.SystemRandom().randint(0, 1) == 0: depth = -5 - elif random.randint(0, 1) == 0: + elif secrets.SystemRandom().randint(0, 1) == 0: depth = -3 else: depth = -4 @@ -167,7 +167,7 @@ def main(output_dir: str = "data"): ) if theme: conversation = conversation + THEME_TEMPLATE.format(theme=theme) - elif random.randint(0, 3) == 0: + elif secrets.SystemRandom().randint(0, 3) == 0: # print("##") # title1 = title.split(".")[0] # title2 = title.split(".")[1] @@ -197,18 +197,18 @@ def main(output_dir: str = "data"): title1 = title.split(".")[0] title2 = title.split(".")[1] conversation = NEW_STORY_AND_DIALOGUE_TEMPLATE.format(title1=title1, story=story) - if random.randint(0, 1) == 0: + if secrets.SystemRandom().randint(0, 1) == 0: conversation = FULL_DIALOGUE_TEMPLATE.format( conversation=conversation, dialogue="\n ".join(dialogue) ) - elif random.randint(0, 1) == 0 and len(dialogue) > 5: + elif secrets.SystemRandom().randint(0, 1) == 0 and len(dialogue) > 5: conversation = MORE_DIALOGUE_TEMPLATE.format( conversation=conversation, dialogue1="\n ".join(dialogue[:-5]), title2=title2, dialogue2="\n ".join(dialogue[-5:]), ) - elif random.randint(0, 1) == 0: + elif secrets.SystemRandom().randint(0, 1) == 0: conversation = NEXT_DIALOGUE_TEMPLATE.format( conversation=conversation, dialogue1="\n ".join(dialogue[:-3]), diff --git a/data/datasets/tatoeba_mt_qna_oa/data_process.py b/data/datasets/tatoeba_mt_qna_oa/data_process.py index bfaadf20da..61f4fe641c 100644 --- a/data/datasets/tatoeba_mt_qna_oa/data_process.py +++ b/data/datasets/tatoeba_mt_qna_oa/data_process.py @@ -1,5 +1,4 @@ import json -import random import uuid from dataclasses import dataclass @@ -9,8 +8,9 @@ import language_paraphrase import language_translate import pandas as pd +import secrets -random.seed(42) +secrets.SystemRandom().seed(42) class DataProcess: @@ -24,8 +24,8 @@ def randomize_text(self, text, original_lang=None, target_lang=None): if not ((original_lang == target_lang) and (original_lang is not None) and (target_lang is not None)) else language_paraphrase.random_templates_paraphrase.get(original_lang, {}) ) - template = random.choice(list(templates.values())) - quote_pair = random.choice(DataProcess().random_quote) + template = secrets.choice(list(templates.values())) + quote_pair = secrets.choice(DataProcess().random_quote) opening_quote, closing_quote = quote_pair original_lang_name = DataProcess.language_name(None, original_lang, original_lang) target_lang_name = DataProcess.language_name(None, target_lang, original_lang) diff --git a/data/datasets/youtube_subs_howto100M/prepare.py b/data/datasets/youtube_subs_howto100M/prepare.py index 78faee1a36..6899841b7b 100644 --- a/data/datasets/youtube_subs_howto100M/prepare.py +++ 
b/data/datasets/youtube_subs_howto100M/prepare.py @@ -3,7 +3,6 @@ import math import os import pickle -import random import re import sys import urllib @@ -13,6 +12,7 @@ import requests from tqdm import tqdm from youtube_transcript_api import YouTubeTranscriptApi +import secrets def get_video_ids(raw_file: str, video_id_pattern: str) -> List[str]: @@ -103,7 +103,7 @@ def main(output_dir: str = "data"): print(f"Total {len(dataset)} pairs extracted.") print("Splitting and saving data...") - random.shuffle(dataset) + secrets.SystemRandom().shuffle(dataset) train_limit = math.ceil(len(dataset) * 0.6) # TODO: parameterize ratios dev_limit = math.ceil(len(dataset) * 0.8) train, validation, test = ( diff --git a/inference/tests/locust/locustfile.py b/inference/tests/locust/locustfile.py index 87ae0b1e78..4e25e52022 100644 --- a/inference/tests/locust/locustfile.py +++ b/inference/tests/locust/locustfile.py @@ -1,10 +1,10 @@ -import random import string import sys import time from pathlib import Path from locust import HttpUser, between, task +import secrets sys.path.append(str(Path(__file__).parent.parent.parent / "text-client")) import text_client_utils as utils # noqa: E402 @@ -12,15 +12,15 @@ class ChatUser(HttpUser): wait_time = between(1, 2) - conversation_length = random.randint(3, 20) - time_to_respond = random.randint(3, 5) # for the user + conversation_length = secrets.SystemRandom().randint(3, 20) + time_to_respond = secrets.SystemRandom().randint(3, 5) # for the user # model_config_name = "distilgpt2" model_config_name = "_lorem" @task def chat(self): client = utils.DebugClient(backend_url="", http_client=self.client) - username = "".join(random.choice(string.ascii_lowercase) for _ in range(20)) + username = "".join(secrets.choice(string.ascii_lowercase) for _ in range(20)) client.login(username) client.create_chat() diff --git a/inference/worker/utils.py b/inference/worker/utils.py index 8a41638d2c..2add1a6d94 100644 --- a/inference/worker/utils.py +++ b/inference/worker/utils.py @@ -1,5 +1,4 @@ import collections -import random import threading import time from typing import Iterable, Literal @@ -14,6 +13,7 @@ from loguru import logger from oasst_shared.schemas import inference from settings import settings +import secrets shared_tokenizer_lock = threading.Lock() @@ -176,7 +176,7 @@ def wait_for_inference_server(http: "HttpClient", timeout: int = 600): except (requests.HTTPError, requests.ConnectionError): if time.time() > time_limit: raise - sleep_duration = random.uniform(0, 10) + sleep_duration = secrets.SystemRandom().uniform(0, 10) logger.warning(f"Inference server not ready. 
Retrying in {sleep_duration:.2f} seconds") time.sleep(sleep_duration) else: diff --git a/model/model_eval/manual/create_synth_import.py b/model/model_eval/manual/create_synth_import.py index 5f5760765d..3c4411c0e2 100644 --- a/model/model_eval/manual/create_synth_import.py +++ b/model/model_eval/manual/create_synth_import.py @@ -1,6 +1,5 @@ import argparse import json -import random import re import sys from uuid import uuid4 @@ -8,6 +7,7 @@ import pydantic from oasst_data import ExportMessageNode, ExportMessageTree from sampling_report import SamplingReport +import secrets def filter_text(s: str) -> str: @@ -70,7 +70,7 @@ def main(): else: reply_by_prompt[p.prompt] = [m] - random.seed(args.seed) + secrets.SystemRandom().seed(args.seed) trees: list[ExportMessageTree] = [] for k, v in reply_by_prompt.items(): # remove exact duplicates @@ -89,7 +89,7 @@ def main(): prompt_message = ExportMessageNode( message_id=str(uuid4()), text=k, role="prompter", synthetic=False, lang=args.lang ) - prompt_message.replies = random.sample(unique_replies, k=min(args.num_replies, len(unique_replies))) + prompt_message.replies = secrets.SystemRandom().sample(unique_replies, k=min(args.num_replies, len(unique_replies))) t = ExportMessageTree(message_tree_id=prompt_message.message_id, tree_state="ranking", prompt=prompt_message) trees.append(t) if args.max_count and len(trees) >= args.max_count: diff --git a/model/model_eval/manual/sampling_report.py b/model/model_eval/manual/sampling_report.py index 1201389c85..191027d526 100644 --- a/model/model_eval/manual/sampling_report.py +++ b/model/model_eval/manual/sampling_report.py @@ -1,7 +1,6 @@ import argparse import gzip import json -import random import re from collections import OrderedDict from datetime import datetime @@ -13,6 +12,7 @@ from model_training.models.peft_modeling import load_peft_model from tqdm import tqdm from transformers import AutoTokenizer, PreTrainedTokenizer +import secrets QA_SPECIAL_TOKENS = {"Question": "", "Answer": "", "StartPrefix": "", "EndPrefix": ""} QA_SPECIAL_TOKENS_V2_5 = { @@ -286,7 +286,7 @@ def main(): print("Device:", device) if args.seed: - random.seed(args.seed) + secrets.SystemRandom().seed(args.seed) torch.manual_seed(args.seed) # load configuration diff --git a/model/model_eval/manual/subsample_dataset.py b/model/model_eval/manual/subsample_dataset.py index 0459834a07..05293f13a7 100644 --- a/model/model_eval/manual/subsample_dataset.py +++ b/model/model_eval/manual/subsample_dataset.py @@ -1,12 +1,12 @@ import argparse import gzip import json -import random from pathlib import Path from typing import Optional import pydantic from oasst_data import ExportMessageTree +import secrets def load_message_trees( @@ -101,7 +101,7 @@ def parse_args(): def main(): args = parse_args() lang_codes = args.lang.split(",") - random.seed(args.seed) + secrets.SystemRandom().seed(args.seed) trees = load_message_trees( args.input_file, lang_codes=lang_codes, @@ -111,7 +111,7 @@ def main(): print(f"Matching messages trees: {len(trees)}") assert len(trees) > args.k, f"Not enough trees ({len(trees)} found, {args.k} required)" - sub_sample = random.sample(trees, k=args.k) + sub_sample = secrets.SystemRandom().sample(trees, k=args.k) if args.only_prompts: sub_sample = [x.prompt for x in sub_sample] diff --git a/model/model_training/custom_datasets/dialogue_collator.py b/model/model_training/custom_datasets/dialogue_collator.py index f0125768ae..2ba48e054c 100644 --- a/model/model_training/custom_datasets/dialogue_collator.py +++ 
b/model/model_training/custom_datasets/dialogue_collator.py @@ -1,4 +1,3 @@ -import random from dataclasses import dataclass from typing import Optional, Union @@ -13,6 +12,7 @@ ) from torch.nn import functional as F from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase, TruncationStrategy +import secrets @dataclass @@ -50,7 +50,7 @@ def __post_init__(self): def process_one(self, messages, return_length=False): total_short_context_one = 0 - if random.random() < self.random_offset_probability and not isinstance(messages, DatasetEntryLm): + if secrets.SystemRandom().random() < self.random_offset_probability and not isinstance(messages, DatasetEntryLm): truncation = TruncationStrategy.DO_NOT_TRUNCATE max_length = None else: @@ -116,7 +116,7 @@ def process_one(self, messages, return_length=False): input_length = len(flatten_message.input_ids) if self.max_length and input_length > self.max_length: - offset = random.randint(0, input_length - self.max_length) + offset = secrets.SystemRandom().randint(0, input_length - self.max_length) for k in flatten_message.keys(): v = flatten_message[k] if isinstance(v, list) and len(v) == input_length: @@ -151,7 +151,7 @@ def __call__(self, features): _flatten_messages, _label_masks = [], [] prev_short_msg, prev_short_mask = None, None for flatten_msg, label_mask in zip(flatten_messages, label_masks): - if len(flatten_msg.input_ids) < self.mix_length_threshold and random.random() > self.mix_probability: + if len(flatten_msg.input_ids) < self.mix_length_threshold and secrets.SystemRandom().random() > self.mix_probability: if prev_short_msg is not None: for key in flatten_msg.keys(): flatten_msg[key] += prev_short_msg[key] diff --git a/model/model_training/custom_datasets/formatting.py b/model/model_training/custom_datasets/formatting.py index a5bfdc7394..5962ba3443 100644 --- a/model/model_training/custom_datasets/formatting.py +++ b/model/model_training/custom_datasets/formatting.py @@ -1,11 +1,11 @@ import re from enum import Enum from itertools import zip_longest -from random import random, shuffle from typing import Literal, Optional from pydantic import BaseModel, validator from pydantic.fields import ModelField +import secrets QA_SPECIAL_TOKENS = { "Question": "<|prompter|>", @@ -72,7 +72,7 @@ def system_tag( if add_length: properties.append(("length", compute_length(self.text))) - shuffle(properties) + secrets.SystemRandom().shuffle(properties) # ensure that potentially multi-line conext field comes last if self.context: @@ -80,7 +80,7 @@ def system_tag( fragments: list[str] = [] for k, v in properties: - if random() < property_dropout: + if secrets.SystemRandom().random() < property_dropout: continue if isinstance(v, float): diff --git a/model/model_training/custom_datasets/instruction.py b/model/model_training/custom_datasets/instruction.py index 37c4026bb3..9fb2d4dd0c 100644 --- a/model/model_training/custom_datasets/instruction.py +++ b/model/model_training/custom_datasets/instruction.py @@ -1,13 +1,13 @@ """ These are in the form of 'INSTRUCTION', 'RESPONSE' """ -import random from typing import Optional from datasets import load_dataset from model_training.custom_datasets.formatting import DatasetEntry, create_dataset_entry_qa from model_training.custom_datasets.utils import _filter_by_words from torch.utils.data import Dataset +import secrets INSTRUCTION_DATASETS = { # Note humaneval_mbpp_codegen_qa returns a code string that we would want to at least wrap in ``` marks` @@ -104,7 +104,7 @@ def __init__( 
questions, answers = [], [] item_len = 0 - rng = random.Random(seed) + rng = secrets.SystemRandom().Random(seed) order = list(range(len(ds))) rng.shuffle(order) diff --git a/model/model_training/custom_datasets/qa_datasets.py b/model/model_training/custom_datasets/qa_datasets.py index d7c69cc072..ede7e7b804 100644 --- a/model/model_training/custom_datasets/qa_datasets.py +++ b/model/model_training/custom_datasets/qa_datasets.py @@ -4,7 +4,6 @@ import glob import json import os -import random import re from collections import defaultdict from pathlib import Path @@ -18,6 +17,7 @@ from model_training.custom_datasets.utils import _filter_by_words from torch import Generator from torch.utils.data import Dataset, Subset, random_split +import secrets # @agoryuno contributed this re_reference_remove = re.compile(r"\[\d+(?:,\s*\d+)*?\]") @@ -485,7 +485,7 @@ def process_vicuna_conversations( return None elif speaker == "human": # replace empty messages with one of the following - message = random.choice(["...", "Please continue", "Go on", ""]) + message = secrets.choice(["...", "Please continue", "Go on", ""]) # remove markdown escaping in revision 192ab2185289094fc556ec8ce5ce1e8e587154ca # python-markdownify with escape_asterisks & escape_underscores True is used # for pre-processing the dataset. @@ -664,7 +664,7 @@ def _process_instruction(self, row: dict[str, str]) -> DatasetEntry | None: ) # Concatenate the instruction and input. else: - linking_char = random.choice(LINKING_CHARS) + linking_char = secrets.choice(LINKING_CHARS) return create_dataset_entry_qa( mode=self.mode, questions=[f"{row['instruction']}{linking_char}{row['input']}"], diff --git a/model/model_training/custom_datasets/rank_datasets.py b/model/model_training/custom_datasets/rank_datasets.py index 01d68999e7..ca13037a91 100644 --- a/model/model_training/custom_datasets/rank_datasets.py +++ b/model/model_training/custom_datasets/rank_datasets.py @@ -1,10 +1,10 @@ -import random from collections import defaultdict from typing import List import numpy as np from datasets import load_dataset from torch.utils.data import Dataset +import secrets SEED = 2020 @@ -173,7 +173,7 @@ def __getitem__(self, idx): # we want to prevent modifying user_answer_ranks rank = user_answer_ranks if len(bad_samples) > 0: - additional = random.choice(bad_samples) + additional = secrets.choice(bad_samples) rank = user_answer_ranks + [additional] return prefixes, rank diff --git a/model/model_training/custom_datasets/summarization.py b/model/model_training/custom_datasets/summarization.py index d9a7efdfe0..be3d9da697 100644 --- a/model/model_training/custom_datasets/summarization.py +++ b/model/model_training/custom_datasets/summarization.py @@ -1,11 +1,11 @@ """ Summarize different spectrum of documents """ -import random import numpy as np from datasets import load_dataset from torch.utils.data import Dataset +import secrets SUMMARIZATION_SPECIAL_TOKENS = {"Text": "", "Summary": ["TL;DR:", "Summarize this", "Give me the summary"]} @@ -72,9 +72,9 @@ def __getitem__(self, idx): text, summary = data[self.text_column], data[self.summary_column] text, summary = self.preprocess_fn(text, summary) if self.name in SUMMARY_SPECIAL_PROMPT: - prompt = random.choice(SUMMARIZATION_SPECIAL_TOKENS["Summary"]) + prompt = secrets.choice(SUMMARIZATION_SPECIAL_TOKENS["Summary"]) else: - prompt = random.choice(SUMMARIZATION_SPECIAL_TOKENS["Summary"]) + prompt = secrets.choice(SUMMARIZATION_SPECIAL_TOKENS["Summary"]) context = "".join([SUMMARIZATION_SPECIAL_TOKENS["Text"], " 
".join(text.split(" ")[: self.max_words]), prompt]) return (context, summary) @@ -148,7 +148,7 @@ def __getitem__(self, index: int) -> tuple | list: context = self.posts[index] # return pairs of comparison good_summary, bad_summary = self.summary_pairs[index] - prompt = random.choice(SUMMARIZATION_PROMPTS) + prompt = secrets.choice(SUMMARIZATION_PROMPTS) # pair very big # we are going to do some sampling @@ -251,7 +251,7 @@ def __getitem__(self, index) -> tuple | list: context = self.index2summary[index] # return pairs of comparison rows = self.summaries[context] - prompt = random.choice(SUMMARIZATION_PROMPTS) + prompt = secrets.choice(SUMMARIZATION_PROMPTS) # pair very big # we are going to do some sampling diff --git a/model/model_training/custom_datasets/toxic_conversation.py b/model/model_training/custom_datasets/toxic_conversation.py index 61ddad9233..5f0e7500bc 100644 --- a/model/model_training/custom_datasets/toxic_conversation.py +++ b/model/model_training/custom_datasets/toxic_conversation.py @@ -2,10 +2,10 @@ SFT dataset to reject toxic questions """ -import random from datasets import load_dataset from torch.utils.data import Dataset +import secrets class ProsocialDialogueExplaination(Dataset): @@ -29,7 +29,7 @@ def __init__(self, split="train", cache_dir=".cache") -> None: self.pairs = [] for row in dataset: for safety_annotation, safe_answer in zip(row["safety_annotations"], row["safety_annotation_reasons"]): - (prompt_template, answer_template) = random.choice(self.TEMPLATE) + (prompt_template, answer_template) = secrets.choice(self.TEMPLATE) self.pairs.append( ( prompt_template.format(row["context"], safety_annotation), diff --git a/model/model_training/custom_datasets/translation.py b/model/model_training/custom_datasets/translation.py index 960ec13133..f90f6c6429 100644 --- a/model/model_training/custom_datasets/translation.py +++ b/model/model_training/custom_datasets/translation.py @@ -6,10 +6,10 @@ fill in the blanks : https://huggingface.co/datasets/m_lama """ -import random from datasets import load_dataset from torch.utils.data import Dataset +import secrets # postfix prompt TRANSLATION_PROMPT = { @@ -97,10 +97,10 @@ def __len__(self): return len(self.pairs) def __getitem__(self, index): - if random.random() < self.mix_prob and index > 5 and index < (self.length - 5): - additional = random.randint(0, 10) - 5 + if secrets.SystemRandom().random() < self.mix_prob and index > 5 and index < (self.length - 5): + additional = secrets.SystemRandom().randint(0, 10) - 5 while additional == index: - additional = random.randint(0, 10) - 5 + additional = secrets.SystemRandom().randint(0, 10) - 5 history_text = self.pairs[additional + index] question, answer = self.pairs[index] @@ -117,11 +117,11 @@ def __init__(self, pair="zh-en", split="train", mix_prob=0.2, maximum_size=10000 src, tgt = pair.split("-") for row in dataset: row = row["translation"] - if random.random() > 0.5: - source = random.choice(TRANSLATION_PROMPT[tgt]).format(row[src]) + if secrets.SystemRandom().random() > 0.5: + source = secrets.choice(TRANSLATION_PROMPT[tgt]).format(row[src]) self.pairs.append((source, row[tgt])) else: # translating in reverse direction - source = random.choice(TRANSLATION_PROMPT[src]).format(row[tgt]) + source = secrets.choice(TRANSLATION_PROMPT[src]).format(row[tgt]) self.pairs.append((source, row[src])) # WMT is very large, reduce preprocessing time if len(self.pairs) > maximum_size: @@ -142,12 +142,12 @@ def __init__(self, split="train", mix_prob=0.2) -> None: if lang_code not in 
TRANSLATION_PROMPT:
                 continue
-            if random.random() > 0.5:
-                source = random.choice(TRANSLATION_PROMPT[lang_code]).format(row[src])
+            if secrets.SystemRandom().random() > 0.5:
+                source = secrets.choice(TRANSLATION_PROMPT[lang_code]).format(row[src])
                 self.pairs.append((source, row[tgt]))
             else:  # translating in reverse direction
                 lang_code = "en"
-                source = random.choice(TRANSLATION_PROMPT[lang_code]).format(row[tgt])
+                source = secrets.choice(TRANSLATION_PROMPT[lang_code]).format(row[tgt])
                 self.pairs.append((source, row[src]))
@@ -160,11 +160,11 @@ def __init__(self, pair="de-ja", split="train", year="2016", mix_prob=0.2, maxim
         src, tgt = pair.split("-")
         for row in dataset:
             row = row["translation"]
-            if random.random() > 0.5:
-                source = random.choice(TRANSLATION_PROMPT[tgt]).format(row[src])
+            if secrets.SystemRandom().random() > 0.5:
+                source = secrets.choice(TRANSLATION_PROMPT[tgt]).format(row[src])
                 self.pairs.append((source, row[tgt]))
             else:  # translating in reverse direction
-                source = random.choice(TRANSLATION_PROMPT[src]).format(row[tgt])
+                source = secrets.choice(TRANSLATION_PROMPT[src]).format(row[tgt])
                 self.pairs.append((source, row[src]))
         # WMT is very large
         if len(self.pairs) > maximum_size:
diff --git a/model/model_training/tools/sample_rm_data.py b/model/model_training/tools/sample_rm_data.py
index 48674c48b0..1c339cda96 100644
--- a/model/model_training/tools/sample_rm_data.py
+++ b/model/model_training/tools/sample_rm_data.py
@@ -7,12 +7,12 @@
 """
 import glob
 import json
-import random
 import sys
 from collections import defaultdict
 from copy import deepcopy
 
 from fastlangid.langid import LID
+import secrets
 
 langid = LID()
 
 total_ranks = []
@@ -173,7 +173,7 @@ def process_context(convo):
         for row in new_dataset:
             f.write(json.dumps(row) + "\n")
 
-    random.shuffle(RM_dataset)
+    secrets.SystemRandom().shuffle(RM_dataset)
     train_flag = int(len(RM_dataset) * 0.8)
     test_flag = int(len(RM_dataset) * 0.9)
     train, test, val = RM_dataset[:train_flag], RM_dataset[train_flag:test_flag], RM_dataset[test_flag:]
diff --git a/model/model_training/trainer_rl.py b/model/model_training/trainer_rl.py
index f8756c3152..1859c861b9 100644
--- a/model/model_training/trainer_rl.py
+++ b/model/model_training/trainer_rl.py
@@ -1,7 +1,6 @@
 import argparse
 import math
 import os
-import random
 from argparse import Namespace
 from typing import Sequence
 
@@ -20,6 +19,7 @@
 from utils.ppo_utils import CustomPPOTrainer
 from utils.utils import _strtobool, get_dataset, get_model, init_rng, read_yamls
 from utils.utils_rl import prepare_tensor
+import secrets
 
 
 def argument_parsing(notebook: bool = False, notebook_args: Sequence[str] | None = None, **kwargs):
@@ -160,7 +160,7 @@ def main():
     if training_conf.num_eval_prompts is not None and training_conf.num_eval_prompts > 0:
         eval_prompts = eval_prompts[: training_conf.num_eval_prompts]
 
-    random.shuffle(prompts)
+    secrets.SystemRandom().shuffle(prompts)
     # Sanity Check for prompts to make sure it's loading properly
     with open(r"output.txt", "w") as fp:
         for item in eval_prompts:
diff --git a/model/model_training/utils/utils.py b/model/model_training/utils/utils.py
index 0397822d3c..76654f6e24 100644
--- a/model/model_training/utils/utils.py
+++ b/model/model_training/utils/utils.py
@@ -1,7 +1,6 @@
 import argparse
 import copy
 import math
-import random
 from distutils.util import strtobool
 from pathlib import Path
 from typing import List, NamedTuple
@@ -22,6 +21,7 @@
 from torch.utils.data.distributed import DistributedSampler
 
 from .losses import CrossEntropyLoss, PolyLoss, RMCLSLoss, RMLoss
+import secrets
 
 
 def _strtobool(x):
@@ -100,26 +100,26 @@ def __iter__(self):
         epoch_idx = []
         n = 0
 
-        random.seed(self.epoch + self.seed)
+        secrets.SystemRandom().seed(self.epoch + self.seed)
 
         for i in range(self.num_datasets):
-            sampled_idx = random.sample(range(n, self.dataset_sizes[i] + n), self.dataset_size_per_epoch[i])
+            sampled_idx = secrets.SystemRandom().sample(range(n, self.dataset_sizes[i] + n), self.dataset_size_per_epoch[i])
             n += self.dataset_sizes[i]
             epoch_idx.extend(sampled_idx)
 
         if self.samples_length is not None:
             # sort by samples length and in case of ties randomize
-            epoch_idx = sorted(epoch_idx, key=lambda x: (self.samples_length[x], random.random()))
+            epoch_idx = sorted(epoch_idx, key=lambda x: (self.samples_length[x], secrets.SystemRandom().random()))
 
             if self.shuffle:
                 # do some minor shuffling to avoid repeating the same order
                 # but not too much to avoid too much padding
                 # quasi random basically
                 for i in range(0, len(epoch_idx), 200):  # this should be batch_size dependent
-                    random.shuffle(epoch_idx[i : i + 200])
+                    secrets.SystemRandom().shuffle(epoch_idx[i : i + 200])
         else:
             if self.shuffle:
-                random.shuffle(epoch_idx)
+                secrets.SystemRandom().shuffle(epoch_idx)
 
         # split epoch_idx in world_size chunks
         epoch_idx = epoch_idx[self.rank : self.num_samples : self.world_size]
diff --git a/model/pretokenizer/pretokenize.py b/model/pretokenizer/pretokenize.py
index 184ec515e6..283f9f00d9 100644
--- a/model/pretokenizer/pretokenize.py
+++ b/model/pretokenizer/pretokenize.py
@@ -1,6 +1,5 @@
 import argparse
 import json
-import random
 from enum import IntEnum
 from pathlib import Path
 from subprocess import run
@@ -13,6 +12,7 @@
 from tokenizer import build_tokenizer
 from torch.utils.data import ConcatDataset, Dataset, Subset
 from tqdm import tqdm
+import secrets
 
 
 class IntRole(IntEnum):
@@ -375,7 +375,7 @@ def main():
             print(f"{k}: {v}")
 
     # initialize random states for reproducibility
-    random.seed(args.rng_seed)
+    secrets.SystemRandom().seed(args.rng_seed)
     np.random.seed(args.rng_seed)
     torch.manual_seed(args.rng_seed)
 
diff --git a/oasst-data/examples/split_dataset.py b/oasst-data/examples/split_dataset.py
index 0a47a7ca0c..687f90580e 100644
--- a/oasst-data/examples/split_dataset.py
+++ b/oasst-data/examples/split_dataset.py
@@ -1,7 +1,7 @@
 import argparse
-import random
 
 from oasst_data import read_message_list, write_messages
+import secrets
 
 
 def parse_args():
@@ -43,7 +43,7 @@ def main():
     print(f"Found {len(messages)} matching messages.")
 
     tree_ids = list(set(m.message_tree_id for m in messages))
-    random.shuffle(tree_ids)
+    secrets.SystemRandom().shuffle(tree_ids)
 
     val_size = len(tree_ids) * args.val_percent // 100
 
diff --git a/oasst-shared/oasst_shared/schemas/inference.py b/oasst-shared/oasst_shared/schemas/inference.py
index c4110285de..43865d2249 100644
--- a/oasst-shared/oasst_shared/schemas/inference.py
+++ b/oasst-shared/oasst_shared/schemas/inference.py
@@ -1,6 +1,5 @@
 import enum
 import platform
-import random
 import uuid
 from datetime import datetime
 from typing import Annotated, Literal, Union
@@ -9,6 +8,7 @@
 import pydantic
 import pynvml
 from oasst_shared.model_configs import ModelConfig
+import secrets
 
 INFERENCE_PROTOCOL_VERSION = "1"
 
@@ -195,7 +195,7 @@ class PluginUsed(pydantic.BaseModel):
 
 
 def make_seed() -> int:
-    return random.randint(0, 0xFFFF_FFFF_FFFF_FFFF - 1)
+    return secrets.SystemRandom().randint(0, 0xFFFF_FFFF_FFFF_FFFF - 1)
 
 
 class WorkParameters(pydantic.BaseModel):
diff --git a/scripts/data_augment/data_augment.py b/scripts/data_augment/data_augment.py
index d5c89e3caa..6ba8213153 100644
--- a/scripts/data_augment/data_augment.py
+++ b/scripts/data_augment/data_augment.py
@@ -12,7 +12,6 @@
 import argparse
 import json
-import random
 import string
 from collections import Counter
 
@@ -26,6 +25,7 @@
 from nltk.corpus import wordnet
 from syntax.syntax_injector import SyntaxBug
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5ForConditionalGeneration, pipeline
+import secrets
 
 
 class DataAugmenter:
@@ -97,8 +97,8 @@ def parse_single(self, essay):
         # Make structure error (shuffle one paragraph with another)
         essay_paragraphs = essay.split("\n\n")  # Splitting a String by newline character (\n)
 
-        rand1 = random.randint(0, len(essay_paragraphs) - 1)
-        rand2 = random.randint(0, len(essay_paragraphs) - 1)
+        rand1 = secrets.SystemRandom().randint(0, len(essay_paragraphs) - 1)
+        rand2 = secrets.SystemRandom().randint(0, len(essay_paragraphs) - 1)
 
         temp = essay_paragraphs[rand1]
         essay_paragraphs[rand1] = essay_paragraphs[rand2]
@@ -110,13 +110,13 @@ def parse_single(self, essay):
         essay_words = essay.split()
 
         for i in range(len(essay_words)):
-            if random.randint(0, 100) < 30:
+            if secrets.SystemRandom().randint(0, 100) < 30:
                 suggestion = []
                 for syn in wordnet.synsets(essay_words[i]):
                     for l in syn.lemmas():
                         suggestion.append(l.name())
                 if suggestion != []:
-                    essay_words[i] = suggestion[random.randint(0, len(suggestion) - 1)]
+                    essay_words[i] = suggestion[secrets.SystemRandom().randint(0, len(suggestion) - 1)]
 
         corrupted_essay = " ".join(essay_words)
 
@@ -124,8 +124,8 @@ def parse_single(self, essay):
         # you can change the number 60 to change how much corrupted this essay will be
         for _ in range(len(essay) // 60):
-            rand = random.randint(0, len(essay))
-            corrupted_essay = essay[:rand] + random.choice(string.ascii_letters) + essay[rand + 1 :]
+            rand = secrets.SystemRandom().randint(0, len(essay))
+            corrupted_essay = essay[:rand] + secrets.choice(string.ascii_letters) + essay[rand + 1 :]
 
         instructions.append("Fix typing errors in this essay" + corrupted_essay)
 
diff --git a/text-frontend/__main__.py b/text-frontend/__main__.py
index 3e90910be3..959f7f8a1e 100644
--- a/text-frontend/__main__.py
+++ b/text-frontend/__main__.py
@@ -1,10 +1,10 @@
 """Simple REPL frontend."""
 import http
-import random
 
 import requests
 import typer
+import secrets
 
 app = typer.Typer()
 
@@ -14,7 +14,7 @@
 
 
 def _random_message_id():
-    return str(random.randint(1000, 9999))
+    return str(secrets.SystemRandom().randint(1000, 9999))
 
 
 def _render_message(message: dict) -> str:
diff --git a/text-frontend/auto_main.py b/text-frontend/auto_main.py
index 101e86c0a5..2fd31c5ee7 100644
--- a/text-frontend/auto_main.py
+++ b/text-frontend/auto_main.py
@@ -1,12 +1,12 @@
 """Simple REPL frontend."""
 import http
-import random
 from uuid import uuid4
 
 import requests
 import typer
 from faker import Faker
+import secrets
 
 app = typer.Typer()
 fake = Faker()
 
@@ -37,7 +37,7 @@ def _post(path: str, json: dict) -> dict:
     return response.json()
 
 def gen_random_text():
-    return " ".join([random.choice(["hello", "world", "foo", "bar"]) for _ in range(10)])
+    return " ".join([secrets.choice(["hello", "world", "foo", "bar"]) for _ in range(10)])
 
 def gen_random_ranking(messages):
     """rank messages randomly and return list of indexes in order of rank randomly"""
@@ -45,7 +45,7 @@ def gen_random_ranking(messages):
     print(messages)
     print(len(messages))
     ranks = [i for i in range(len(messages))]
-    shuffled = random.shuffle(ranks)
+    shuffled = secrets.SystemRandom().shuffle(ranks)
     print(ranks)
     print(shuffled)
     return ranks
@@ -108,15 +108,15 @@ def gen_random_ranking(messages):
         labels_dict = None
         if task["mode"] == "simple" and len(valid_labels) == 1:
-            answer = random.choice([True, False])
+            answer = secrets.choice([True, False])
             labels_dict = {valid_labels[0]: 1 if answer else 0}
         else:
-            labels = random.sample(valid_labels, random.randint(1, len(valid_labels)))
+            labels = secrets.SystemRandom().sample(valid_labels, secrets.SystemRandom().randint(1, len(valid_labels)))
             for l in mandatory_labels:
                 if l not in labels:
                     labels.append(l)
-            labels_dict = {label: random.random() for label in valid_labels}
-            if random.random() < 0.9:
+            labels_dict = {label: secrets.SystemRandom().random() for label in valid_labels}
+            if secrets.SystemRandom().random() < 0.9:
                 labels_dict["spam"] = 0
                 labels_dict["lang_mismatch"] = 0
 
@@ -225,15 +225,15 @@ def gen_random_ranking(messages):
         labels_dict = None
         if task["mode"] == "simple" and len(valid_labels) == 1:
-            answer = random.choice([True, False])
+            answer = secrets.choice([True, False])
             labels_dict = {valid_labels[0]: 1 if answer else 0}
         else:
-            labels = random.sample(valid_labels, random.randint(1, len(valid_labels)))
+            labels = secrets.SystemRandom().sample(valid_labels, secrets.SystemRandom().randint(1, len(valid_labels)))
             for l in mandatory_labels:
                 if l not in labels:
                     labels.append(l)
-            labels_dict = {label: random.random() for label in valid_labels}
-            if random.random() < 0.9:
+            labels_dict = {label: secrets.SystemRandom().random() for label in valid_labels}
+            if secrets.SystemRandom().random() < 0.9:
                 labels_dict["spam"] = 0
                 labels_dict["lang_mismatch"] = 0
 

From 8fd7727fa8e739a332481ca4271532b6fbf4e8c6 Mon Sep 17 00:00:00 2001
From: "pixeebot[bot]" <104101892+pixeebot[bot]@users.noreply.github.com>
Date: Mon, 6 May 2024 23:03:24 -0400
Subject: [PATCH 3/3] Sandbox URL Creation (#8)

Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com>
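
The rewritten call sites below all follow one pattern: requests.get is
swapped for safe_requests.get from the "security" package, which this
patch pins to 1.2.1 in oasst-shared/pyproject.toml. The sketch below is
only an illustration of that pattern; the exact validation performed by
safe_requests.get before the request is sent (for example rejecting
non-HTTP schemes or private/internal hosts) is an assumption about the
library, and the fetch_text helper is hypothetical, not part of this
change.

    # Hypothetical helper showing the call-site pattern used throughout
    # this patch; assumes safe_requests.get accepts the same keyword
    # arguments as requests.get and raises if the target URL is rejected.
    from security import safe_requests

    def fetch_text(url: str) -> str:
        # Keep the explicit timeout added in the earlier patch in place.
        response = safe_requests.get(url, timeout=60)
        response.raise_for_status()  # surface HTTP errors to the caller
        return response.text
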
---
 data/datasets/biostars_qa/get_biostars_dataset.py       | 4 ++--
 data/datasets/oa_dolly_15k/create_dataset.py            | 5 ++---
 data/datasets/oa_stackexchange/download.py              | 3 ++-
 data/datasets/youtube_subs_howto100M/prepare.py         | 5 ++---
 data/datasets/zhihu-kol/main.py                         | 8 ++++----
 data/datasets/zhihu-kol/scrape_by_topic.py              | 4 ++--
 inference/worker/chat_chain_utils.py                    | 3 ++-
 inference/worker/openapi_parser.py                      | 5 +++--
 inference/worker/utils.py                               | 3 ++-
 model/model_training/custom_datasets/prompt_dialogue.py | 5 ++---
 oasst-shared/pyproject.toml                             | 3 ++-
 scripts/data_augment/data_augment.py                    | 3 ++-
 12 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/data/datasets/biostars_qa/get_biostars_dataset.py b/data/datasets/biostars_qa/get_biostars_dataset.py
index 17ae220343..040ed87385 100644
--- a/data/datasets/biostars_qa/get_biostars_dataset.py
+++ b/data/datasets/biostars_qa/get_biostars_dataset.py
@@ -4,8 +4,8 @@
 import time
 
 import pandas as pd
-import requests
 from tqdm import tqdm
+from security import safe_requests
 
 
 def get_biostars_dataset(start_idx=9557161, accept_threshold=1000000, sleep=0.1, folder="biostars"):
@@ -41,7 +41,7 @@ def get_biostars_dataset(start_idx=9557161, accept_threshold=1000000, sleep=0.1,
                 print(f"MSG: {file} exists. Skipping; Current accepted: {has_accepted_count}")
                 continue
 
-            r = requests.get(url, headers=headers, timeout=60)
+            r = safe_requests.get(url, headers=headers, timeout=60)
 
             # print(r.status_code, r.reason)
 
diff --git a/data/datasets/oa_dolly_15k/create_dataset.py b/data/datasets/oa_dolly_15k/create_dataset.py
index e1f1f75581..8464296e0b 100644
--- a/data/datasets/oa_dolly_15k/create_dataset.py
+++ b/data/datasets/oa_dolly_15k/create_dataset.py
@@ -1,15 +1,14 @@
 import json
 from pathlib import Path
-
-import requests
 from datasets import Dataset
+from security import safe_requests
 
 DATA_URL = "https://raw.githubusercontent.com/databrickslabs/dolly/master/data/databricks-dolly-15k.jsonl"
 FILE_PATH = "databricks_dolly_15k.jsonl"
 
 
 def download_data(url: str, destination: str):
-    response = requests.get(url, stream=True, timeout=60)
+    response = safe_requests.get(url, stream=True, timeout=60)
 
     with open(destination, "wb") as handle:
         for data in response.iter_content():
diff --git a/data/datasets/oa_stackexchange/download.py b/data/datasets/oa_stackexchange/download.py
index b90c2579f2..aef005ca2d 100755
--- a/data/datasets/oa_stackexchange/download.py
+++ b/data/datasets/oa_stackexchange/download.py
@@ -14,6 +14,7 @@
 
 import requests
 from bs4 import BeautifulSoup as bs
+from security import safe_requests
 
 BASE_URL = "https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/{0}&file=Posts.xml"
 DOWNLOAD_DIR = "xml/"
@@ -50,7+51,7 @@ def download_url(dataset_name: str, url: str):
         return cache_path
     else:
         print("Downloading xml: ", dataset_name)
-        response = requests.get(url, timeout=60)
+        response = safe_requests.get(url, timeout=60)
         print("Finished downloading: ", dataset_name)
         with open(cache_path, "wb") as f:
             f.write(response.content)
diff --git a/data/datasets/youtube_subs_howto100M/prepare.py b/data/datasets/youtube_subs_howto100M/prepare.py
index 6899841b7b..05bf1bb0a7 100644
--- a/data/datasets/youtube_subs_howto100M/prepare.py
+++ b/data/datasets/youtube_subs_howto100M/prepare.py
@@ -8,11 +8,10 @@
 import urllib
 import zipfile
 from typing import List
-
-import requests
 from tqdm import tqdm
 from youtube_transcript_api import YouTubeTranscriptApi
 import secrets
+from security import safe_requests
 
 
 def get_video_ids(raw_file: str, video_id_pattern: str) -> List[str]:
@@ -74,7 +73,7 @@ def main(output_dir: str = "data"):
     print("Downloading HowTo100M raw_caption.zip...")
     print(" might take some time(3.4G)...")
     url = "https://www.rocq.inria.fr/cluster-willow/amiech/howto100m/raw_caption.zip"
-    response = requests.get(url, timeout=60)
+    response = safe_requests.get(url, timeout=60)
 
     zipped = zipfile.ZipFile(io.BytesIO(response.content))
     zipped.extractall("./temp")
diff --git a/data/datasets/zhihu-kol/main.py b/data/datasets/zhihu-kol/main.py
index d554320124..1f2c0497c4 100644
--- a/data/datasets/zhihu-kol/main.py
+++ b/data/datasets/zhihu-kol/main.py
@@ -4,10 +4,10 @@
 
 import multitasking
 import pandas as pd
-import requests
 from bs4 import BeautifulSoup
 from retry import retry
 from tqdm import tqdm
+from security import safe_requests
 
 
 def get_uid_by_url_token(url_token: str) -> str:
@@ -41,7 +41,7 @@ def get_uid_by_url_token(url_token: str) -> str:
     }
     url = "https://api.zhihu.com/people/" + url_token
 
-    response = requests.get(url, headers=headers, timeout=60)
+    response = safe_requests.get(url, headers=headers, timeout=60)
     uid = response.json()["id"]
     return uid
 
@@ -100,7 +100,7 @@ def get_user_answers(url_token: str, max_count: int = 100000) -> pd.DataFrame:
             ("offset", f"{offset}"),
         )
-        response = requests.get(url, headers=headers, params=params, timeout=60)
+        response = safe_requests.get(url, headers=headers, params=params, timeout=60)
 
         if response.json().get("paging") is None:
             return pd.DataFrame(columns=operations.keys())
@@ -148,7 +148,7 @@ def get_answer_content(qid: str, aid) -> str:
         "Host": "www.zhihu.com",
     }
     url = f"https://www.zhihu.com/question/{qid}/answer/{aid}"
-    response = requests.get(url, headers=headers, timeout=60)
+    response = safe_requests.get(url, headers=headers, timeout=60)
     soup = BeautifulSoup(response.text, "html.parser")
 
     content = " ".join([p.text.strip() for p in soup.find_all("p")])
diff --git a/data/datasets/zhihu-kol/scrape_by_topic.py b/data/datasets/zhihu-kol/scrape_by_topic.py
index dd8dc685d4..97aad2f8b7 100644
--- a/data/datasets/zhihu-kol/scrape_by_topic.py
+++ b/data/datasets/zhihu-kol/scrape_by_topic.py
@@ -8,11 +8,11 @@
 
 import numpy as np
 import pandas as pd
-import requests
 from bs4 import BeautifulSoup
 from loguru import logger
 from playwright.sync_api import Locator, Page, sync_playwright
 from tqdm import tqdm
+from security import safe_requests
 
 
 @dataclass
@@ -46,7 +46,7 @@ def get_answer_content(qid: int, aid: int, question_str: str) -> str:
         "Host": "www.zhihu.com",
     }
     url = f"https://www.zhihu.com/question/{qid}/answer/{aid}"
-    response = requests.get(url, headers=headers, timeout=60)
+    response = safe_requests.get(url, headers=headers, timeout=60)
     soup = BeautifulSoup(response.text, "html.parser")
 
     content = " ".join([p.text.strip() for p in soup.find_all("p")])
diff --git a/inference/worker/chat_chain_utils.py b/inference/worker/chat_chain_utils.py
index 602c925333..0185ac7781 100644
--- a/inference/worker/chat_chain_utils.py
+++ b/inference/worker/chat_chain_utils.py
@@ -14,6 +14,7 @@
 from openapi_parser import prepare_plugin_for_llm
 from settings import settings
 from utils import shared_tokenizer_lock, special_tokens
+from security import safe_requests
 
 RESPONSE_MAX_LENGTH = 2048
 DESCRIPTION_FOR_MODEL_MAX_LENGTH = 512
@@ -205,7 +206,7 @@ def run_request(self, params: str, url: str, param_location: str, type: str, pay
             logger.info(
                 f"Running {type.upper()} request on {url} with\nparams: {params}\nparam_location: {param_location}\npayload: {payload}"
             )
-            res = requests.get(url, params=query_params, headers=headers, timeout=60)
+            res = safe_requests.get(url, params=query_params, headers=headers, timeout=60)
         elif type.lower() == "post":
             # if model did not generate payload object, use params as payload
             data = json.dumps(payload) if payload else json.dumps(params)
diff --git a/inference/worker/openapi_parser.py b/inference/worker/openapi_parser.py
index 26a9f51eb4..136e75b343 100644
--- a/inference/worker/openapi_parser.py
+++ b/inference/worker/openapi_parser.py
@@ -5,10 +5,11 @@
 import yaml
 from loguru import logger
 from oasst_shared.schemas import inference
+from security import safe_requests
 
 
 def fetch_openapi_spec(url):
-    response = requests.get(url, timeout=60)
+    response = safe_requests.get(url, timeout=60)
     if response.status_code != 200:
         raise Exception(f"Failed to fetch data from URL: {url}. Status code: {response.status_code}")
@@ -29,7 +30,7 @@ def fetch_openapi_spec(url):
 
 def get_plugin_config(url: str) -> inference.PluginConfig | None:
     try:
-        response = requests.get(url, timeout=60)
+        response = safe_requests.get(url, timeout=60)
         response.raise_for_status()
         plugin_dict = response.json()
         logger.info(f"Plugin config downloaded {plugin_dict}")
diff --git a/inference/worker/utils.py b/inference/worker/utils.py
index 2add1a6d94..4c9a61559f 100644
--- a/inference/worker/utils.py
+++ b/inference/worker/utils.py
@@ -14,6 +14,7 @@
 from oasst_shared.schemas import inference
 from settings import settings
 import secrets
+from security import safe_requests
 
 shared_tokenizer_lock = threading.Lock()
 
@@ -258,7 +259,7 @@ def _maybe_add_bearer_token(self, headers: dict[str, str] | None):
 
     def get(self, path: str, **kwargs):
         kwargs["headers"] = self._maybe_add_bearer_token(kwargs.get("headers"))
-        return requests.get(self.base_url + path, auth=self.auth, **kwargs, timeout=60)
+        return safe_requests.get(self.base_url + path, auth=self.auth, **kwargs, timeout=60)
 
     def post(self, path: str, **kwargs):
         kwargs["headers"] = self._maybe_add_bearer_token(kwargs.get("headers"))
diff --git a/model/model_training/custom_datasets/prompt_dialogue.py b/model/model_training/custom_datasets/prompt_dialogue.py
index b3f41f8376..39816fcd25 100644
--- a/model/model_training/custom_datasets/prompt_dialogue.py
+++ b/model/model_training/custom_datasets/prompt_dialogue.py
@@ -3,14 +3,13 @@
 import re
 from pathlib import Path
 from typing import List, Mapping, Optional, Sequence, Union
-
-import requests
 from datasets import load_dataset
 from model_training.custom_datasets.formatting import DatasetEntrySft, Role, Utterance
 from model_training.custom_datasets.oasst_dataset import ListDataset
 from model_training.custom_datasets.utils import _filter_by_words
 from torch import Generator, randperm
 from torch.utils.data import Dataset, random_split
+from security import safe_requests
 
 
 def load_oig_file(
@@ -34,7 +33,7 @@ def load_oig_file(
     # download file if not cached
     if not local_path.exists() or local_path.stat().st_size == 0 or no_cache:
         print(f"downloading {source_url} to {local_path}")
-        r = requests.get(source_url, stream=True, timeout=60)
+        r = safe_requests.get(source_url, stream=True, timeout=60)
         with local_path.open(mode="wb") as fd:
             for chunk in r.iter_content(chunk_size=1024 * 1024):
                 fd.write(chunk)
diff --git a/oasst-shared/pyproject.toml b/oasst-shared/pyproject.toml
index baee4f9e18..b93695ce5e 100644
--- a/oasst-shared/pyproject.toml
+++ b/oasst-shared/pyproject.toml
@@ -11,7 +11,8 @@ dependencies = [
     "aiohttp[speedups]",
     "loguru==0.6.0",
     "psutil==5.9.4",
-    "pynvml==11.5.0"
+    "pynvml==11.5.0",
+    "security==1.2.1"
 ]
 
 [project.optional-dependencies]
diff --git a/scripts/data_augment/data_augment.py b/scripts/data_augment/data_augment.py
index 6ba8213153..04e0992ba6 100644
--- a/scripts/data_augment/data_augment.py
+++ b/scripts/data_augment/data_augment.py
@@ -26,6 +26,7 @@
 from syntax.syntax_injector import SyntaxBug
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5ForConditionalGeneration, pipeline
 import secrets
+from security import safe_requests
 
 
 class DataAugmenter:
@@ -245,7 +246,7 @@ def parse(self, _):
 
             xml_posts_path = urls.get(dataset_name)
 
-            response = requests.get(xml_posts_path, timeout=60)
+            response = safe_requests.get(xml_posts_path, timeout=60)
 
             df = self.xml_to_df(response)
             df = self.filter(df)