diff --git a/textwiz/__init__.py b/textwiz/__init__.py
index 902aedf..02615a4 100644
--- a/textwiz/__init__.py
+++ b/textwiz/__init__.py
@@ -8,11 +8,11 @@ from .streamer import TextContinuationStreamer
 # import it here so that the warnings are suppressed when doing `import textwiz`
 from . import warnings_suppressor
 
-# also import some of the submodules for convenience
+# also directly import some of the submodules for convenience and auto-complete
 from . import loader, conversation_template, prompt_template
 
 
-__version__ = '0.0.7'
+__version__ = '0.1.0'
 
 
 def is_chat_model(model_name: str) -> bool:
@@ -34,3 +34,85 @@ def is_chat_model(model_name: str) -> bool:
 
     template = get_prompt_template(model_name)
     return template.default_mode == 'chat'
+
+
+def estimate_number_of_gpus(models: list[str], quantization_8bits: bool = False, quantization_4bits: bool = False,
+                            max_fraction_gpu_0: float = 0.8, max_fraction_gpus: float = 0.8) -> list[int]:
+    """Estimate the number of gpus needed to run each of the `models` correctly.
+
+    Parameters
+    ----------
+    models : list[str]
+        The models.
+    quantization_8bits : bool
+        Whether the model will be loaded in 8 bits mode, by default False.
+    quantization_4bits : bool
+        Whether the model will be loaded in 4 bits mode, by default False.
+    max_fraction_gpu_0 : float, optional
+        The maximum fraction of the gpu 0 memory to reserve for the model. The default is 0.8.
+    max_fraction_gpus : float, optional
+        The maximum fraction of the other gpus memory to reserve for the model. The default is 0.8.
+
+    Returns
+    -------
+    list[int]
+        The number of gpus for each model.
+    """
+
+    model_footprints = []
+    for model in models:
+        # Override quantization for bloom because it's too big to load in float16
+        if model == 'bloom-176B' and not (quantization_8bits or quantization_4bits):
+            gpu_needed, _ = estimate_model_gpu_footprint(model, quantization_8bits=True, quantization_4bits=False,
+                                                         max_fraction_gpu_0=0.9, max_fraction_gpus=0.9)
+        else:
+            gpu_needed, _ = estimate_model_gpu_footprint(model, quantization_8bits=quantization_8bits,
+                                                         quantization_4bits=quantization_4bits,
+                                                         max_fraction_gpu_0=max_fraction_gpu_0,
+                                                         max_fraction_gpus=max_fraction_gpus)
+        model_footprints.append(gpu_needed)
+
+    return model_footprints
+
+
+
+# Relatively small models (they should fit on a single A100 40GB GPU)
+SMALL_MODELS = tuple(model for model in loader.ALLOWED_MODELS if loader.ALL_MODELS_PARAMS[model] <= 16)
+# Large models (they require more than 1 A100 40GB GPU)
+LARGE_MODELS = tuple(model for model in loader.ALLOWED_MODELS if loader.ALL_MODELS_PARAMS[model] > 16)
+
+assert set(loader.ALLOWED_MODELS) == set(SMALL_MODELS + LARGE_MODELS), 'We are somehow missing some models...'
+
+
+# Models with a non-default prompt template
+SMALL_MODELS_SPECIAL_PROMPT = tuple(model for model in SMALL_MODELS if model in prompt_template.PROMPT_MAPPING.keys())
+LARGE_MODELS_SPECIAL_PROMPT = tuple(model for model in LARGE_MODELS if model in prompt_template.PROMPT_MAPPING.keys())
+
+
+
+# Models that we decided to keep for further code benchmarks
+GOOD_CODERS = (
+    'star-coder-base',
+    'star-coder',
+    'star-chat-alpha',
+    'codegen-16B',
+    'codegen25-7B',
+    'codegen25-7B-instruct',
+    'code-llama-34B',
+    'code-llama-34B-python',
+    'code-llama-34B-instruct',
+    'llama2-70B',
+    'llama2-70B-chat',
+)
+
+
+SMALL_GOOD_CODERS = tuple(model for model in GOOD_CODERS if model in SMALL_MODELS)
+LARGE_GOOD_CODERS = tuple(model for model in GOOD_CODERS if model in LARGE_MODELS)
+
+
+assert set(GOOD_CODERS) == set(SMALL_GOOD_CODERS + LARGE_GOOD_CODERS), 'We are somehow missing some good coder models...'
+
+
+# Models that we decided to keep for further code benchmarks with a non-default prompt template
+SMALL_GOOD_CODERS_SPECIAL_PROMPT = tuple(model for model in SMALL_GOOD_CODERS if model in prompt_template.PROMPT_MAPPING.keys())
+LARGE_GOOD_CODERS_SPECIAL_PROMPT = tuple(model for model in LARGE_GOOD_CODERS if model in prompt_template.PROMPT_MAPPING.keys())
\ No newline at end of file
diff --git a/textwiz/web_interface.py b/textwiz/web_interface.py
new file mode 100644
index 0000000..3fef1fe
--- /dev/null
+++ b/textwiz/web_interface.py
@@ -0,0 +1,370 @@
+"""This module provides convenience functions to use when creating a Gradio web app."""
+import queue
+import copy
+from concurrent.futures import ThreadPoolExecutor
+
+from transformers import TextIteratorStreamer
+import gradio as gr
+
+from .generation import HFModel
+from .streamer import TextContinuationStreamer
+from .conversation_template import GenericConversation
+
+
+TIMEOUT = 20
+
+
+def text_generation(model: HFModel, prompt: str, max_new_tokens: int, do_sample: bool, top_k: int, top_p: float,
+                    temperature: float, use_seed: bool, seed: int, **kwargs) -> str:
+    """Text generation with `model`. This is a generator yielding tokens as they are generated.
+
+    Parameters
+    ----------
+    model : HFModel
+        The model to use for generation.
+    prompt : str
+        The prompt to the model.
+    max_new_tokens : int
+        How many new tokens to generate.
+    do_sample : bool
+        Whether to introduce randomness in the generation.
+    top_k : int
+        How many tokens with max probability to consider for randomness.
+    top_p : float
+        The probability density covering the new tokens to consider for randomness.
+    temperature : float
+        How to cool down the probability distribution. Value between 1 (no cooldown) and 0 (greedy search,
+        no randomness).
+    use_seed : bool
+        Whether to use a fixed seed for reproducibility.
+    seed : int
+        An optional seed to force the generation to be reproducible.
+
+    Yields
+    ------
+    Iterator[str]
+        String containing the sequence generated.
+    """
+
+    if not use_seed:
+        seed = None
+
+    # To show text as it is being generated
+    streamer = TextIteratorStreamer(model.tokenizer, skip_prompt=True, timeout=TIMEOUT, skip_special_tokens=True)
+
+    # We need to launch a new thread to get text from the streamer in real-time as it is being generated. We
+    # use an executor because it makes it easier to catch possible exceptions
+    with ThreadPoolExecutor(max_workers=1) as executor:
+        future = executor.submit(model.generate_text, prompt, max_new_tokens=max_new_tokens, do_sample=do_sample,
+                                 top_k=top_k, top_p=top_p, temperature=temperature, seed=seed,
+                                 truncate_prompt_from_output=True, streamer=streamer, **kwargs)
+
+        # Get results from the streamer and yield them
+        try:
+            # Ask the streamer to skip prompt and reattach it here to avoid showing special prompt formatting
+            generated_text = prompt
+            for new_text in streamer:
+                generated_text += new_text
+                yield generated_text
+
+        # If for some reason the queue (from the streamer) is still empty after timeout, we probably
+        # encountered an exception
+        except queue.Empty:
+            e = future.exception()
+            if e is not None:
+                raise gr.Error(f'The following error happened during generation: {repr(e)}')
+            else:
+                raise gr.Error(f'Generation timed out (no new tokens were generated after {TIMEOUT} s)')
+
+        # Get actual result and yield it (which may be slightly different due to postprocessing)
+        generated_text = future.result()
+        yield prompt + generated_text
+
+
+
+def chat_generation(model: HFModel, conversation: GenericConversation, prompt: str, max_new_tokens: int,
+                    do_sample: bool, top_k: int, top_p: float, temperature: float, use_seed: bool, seed: int,
+                    **kwargs) -> tuple[str, GenericConversation, list[list]]:
+    """Chat generation with `model`. This is a generator yielding tokens as they are generated.
+
+    Parameters
+    ----------
+    model : HFModel
+        The model to use for generation.
+    conversation : GenericConversation
+        Current conversation. This is the value inside a gr.State instance.
+    prompt : str
+        The prompt to the model.
+    max_new_tokens : int
+        How many new tokens to generate.
+    do_sample : bool
+        Whether to introduce randomness in the generation.
+    top_k : int
+        How many tokens with max probability to consider for randomness.
+    top_p : float
+        The probability density covering the new tokens to consider for randomness.
+    temperature : float
+        How to cool down the probability distribution. Value between 1 (no cooldown) and 0 (greedy search,
+        no randomness).
+    use_seed : bool
+        Whether to use a fixed seed for reproducibility.
+    seed : int
+        An optional seed to force the generation to be reproducible.
+
+    Yields
+    ------
+    Iterator[tuple[str, GenericConversation, list[list]]]
+        Corresponds to the gradio components (prompt, conversation, chatbot).
+    """
+
+    if not use_seed:
+        seed = None
+
+    # To show text as it is being generated
+    streamer = TextIteratorStreamer(model.tokenizer, skip_prompt=True, timeout=TIMEOUT, skip_special_tokens=True)
+
+    conv_copy = copy.deepcopy(conversation)
+    conv_copy.append_user_message(prompt)
+
+    # We need to launch a new thread to get text from the streamer in real-time as it is being generated. We
+    # use an executor because it makes it easier to catch possible exceptions
+    with ThreadPoolExecutor(max_workers=1) as executor:
+        # This will update `conversation` in-place
+        future = executor.submit(model.generate_conversation, prompt, conv_history=conversation,
+                                 max_new_tokens=max_new_tokens, do_sample=do_sample, top_k=top_k, top_p=top_p,
+                                 temperature=temperature, seed=seed, truncate_if_conv_too_long=True,
+                                 streamer=streamer, **kwargs)
+
+        # Get results from the streamer and yield them
+        try:
+            generated_text = ''
+            for new_text in streamer:
+                generated_text += new_text
+                # Update model answer (on a copy of the conversation) as it is being generated
+                conv_copy.model_history_text[-1] = generated_text
+                # The first output is an empty string to clear the input box, the second is the conversation
+                # state, and the third is the formatted output to use in a gradio chatbot component
+                yield '', conv_copy, conv_copy.to_gradio_format()
+
+        # If for some reason the queue (from the streamer) is still empty after timeout, we probably
+        # encountered an exception
+        except queue.Empty:
+            e = future.exception()
+            if e is not None:
+                raise gr.Error(f'The following error happened during generation: {repr(e)}')
+            else:
+                raise gr.Error(f'Generation timed out (no new tokens were generated after {TIMEOUT} s)')
+
+    # Update the chatbot with the real conversation (which may be slightly different due to postprocessing)
+    yield '', conversation, conversation.to_gradio_format()
+
+
+
+def continue_generation(model: HFModel, conversation: GenericConversation, additional_max_new_tokens: int,
+                        do_sample: bool, top_k: int, top_p: float, temperature: float, use_seed: bool,
+                        seed: int, **kwargs) -> tuple[GenericConversation, list[list]]:
+    """Continue the last turn of the `model` output. This is a generator yielding tokens as they are generated.
+
+    Parameters
+    ----------
+    model : HFModel
+        The model to use for generation.
+    conversation : GenericConversation
+        Current conversation. This is the value inside a gr.State instance.
+    additional_max_new_tokens : int
+        How many new tokens to generate.
+    do_sample : bool
+        Whether to introduce randomness in the generation.
+    top_k : int
+        How many tokens with max probability to consider for randomness.
+    top_p : float
+        The probability density covering the new tokens to consider for randomness.
+    temperature : float
+        How to cool down the probability distribution. Value between 1 (no cooldown) and 0 (greedy search,
+        no randomness).
+    use_seed : bool
+        Whether to use a fixed seed for reproducibility.
+    seed : int
+        An optional seed to force the generation to be reproducible.
+
+    Yields
+    ------
+    Iterator[tuple[GenericConversation, list[list]]]
+        Corresponds to the gradio components (conversation, chatbot).
+    """
+
+    if len(conversation) == 0:
+        gr.Warning('You cannot continue an empty conversation.')
+        yield conversation, conversation.to_gradio_format()
+        return
+    if conversation.model_history_text[-1] is None:
+        gr.Warning('You cannot continue an empty last turn.')
+        yield conversation, conversation.to_gradio_format()
+        return
+
+    if not use_seed:
+        seed = None
+
+    # To show text as it is being generated
+    streamer = TextContinuationStreamer(model.tokenizer, skip_prompt=True, timeout=TIMEOUT, skip_special_tokens=True)
+
+    conv_copy = copy.deepcopy(conversation)
+
+    # We need to launch a new thread to get text from the streamer in real-time as it is being generated. We
+    # use an executor because it makes it easier to catch possible exceptions
+    with ThreadPoolExecutor(max_workers=1) as executor:
+        # This will update `conversation` in-place
+        future = executor.submit(model.continue_last_conversation_turn, conv_history=conversation,
+                                 max_new_tokens=additional_max_new_tokens, do_sample=do_sample, top_k=top_k, top_p=top_p,
+                                 temperature=temperature, seed=seed, truncate_if_conv_too_long=True, streamer=streamer,
+                                 **kwargs)
+
+        # Get results from the streamer and yield them
+        try:
+            generated_text = conv_copy.model_history_text[-1]
+            for new_text in streamer:
+                generated_text += new_text
+                # Update model answer (on a copy of the conversation) as it is being generated
+                conv_copy.model_history_text[-1] = generated_text
+                # The first output is the conversation state, the second is the formatted output to use in a
+                # gradio chatbot component
+                yield conv_copy, conv_copy.to_gradio_format()
+
+        # If for some reason the queue (from the streamer) is still empty after timeout, we probably
+        # encountered an exception
+        except queue.Empty:
+            e = future.exception()
+            if e is not None:
+                raise gr.Error(f'The following error happened during generation: {repr(e)}')
+            else:
+                raise gr.Error(f'Generation timed out (no new tokens were generated after {TIMEOUT} s)')
+
+    # Update the chatbot with the real conversation (which may be slightly different due to postprocessing)
+    yield conversation, conversation.to_gradio_format()
+
+
+
+def retry_chat_generation(model: HFModel, conversation: GenericConversation, max_new_tokens: int, do_sample: bool,
+                          top_k: int, top_p: float, temperature: float, use_seed: bool,
+                          seed: int, **kwargs) -> tuple[GenericConversation, list[list]]:
+    """Regenerate the last turn of the conversation. This is a generator yielding tokens as they are generated.
+
+    Parameters
+    ----------
+    model : HFModel
+        The model to use for generation.
+    conversation : GenericConversation
+        Current conversation. This is the value inside a gr.State instance.
+    max_new_tokens : int
+        How many new tokens to generate.
+    do_sample : bool
+        Whether to introduce randomness in the generation.
+    top_k : int
+        How many tokens with max probability to consider for randomness.
+    top_p : float
+        The probability density covering the new tokens to consider for randomness.
+    temperature : float
+        How to cool down the probability distribution. Value between 1 (no cooldown) and 0 (greedy search,
+        no randomness).
+    use_seed : bool
+        Whether to use a fixed seed for reproducibility.
+    seed : int
+        An optional seed to force the generation to be reproducible.
+
+    Yields
+    ------
+    Iterator[tuple[GenericConversation, list[list]]]
+        Corresponds to the gradio components (conversation, chatbot).
+    """
+
+    if len(conversation) == 0:
+        gr.Warning('You cannot retry generation on an empty conversation.')
+        yield conversation, conversation.to_gradio_format()
+        return
+    if conversation.model_history_text[-1] is None:
+        gr.Warning('You cannot retry generation on an empty last turn.')
+        yield conversation, conversation.to_gradio_format()
+        return
+
+    if not use_seed:
+        seed = None
+
+    # Remove last turn
+    prompt = conversation.user_history_text[-1]
+    _ = conversation.user_history_text.pop(-1)
+    _ = conversation.model_history_text.pop(-1)
+
+    # To show text as it is being generated
+    streamer = TextIteratorStreamer(model.tokenizer, skip_prompt=True, timeout=TIMEOUT, skip_special_tokens=True)
+
+    conv_copy = copy.deepcopy(conversation)
+    conv_copy.append_user_message(prompt)
+
+    # We need to launch a new thread to get text from the streamer in real-time as it is being generated. We
+    # use an executor because it makes it easier to catch possible exceptions
+    with ThreadPoolExecutor(max_workers=1) as executor:
+        # This will update `conversation` in-place
+        future = executor.submit(model.generate_conversation, prompt, system_prompt=None, conv_history=conversation,
+                                 max_new_tokens=max_new_tokens, do_sample=do_sample, top_k=top_k, top_p=top_p,
+                                 temperature=temperature, seed=seed, truncate_if_conv_too_long=True, streamer=streamer,
+                                 **kwargs)
+
+        # Get results from the streamer and yield them
+        try:
+            generated_text = ''
+            for new_text in streamer:
+                generated_text += new_text
+                # Update model answer (on a copy of the conversation) as it is being generated
+                conv_copy.model_history_text[-1] = generated_text
+                # The first output is the conversation state, the second is the formatted output to use in a
+                # gradio chatbot component
+                yield conv_copy, conv_copy.to_gradio_format()
+
+        # If for some reason the queue (from the streamer) is still empty after timeout, we probably
+        # encountered an exception
+        except queue.Empty:
+            e = future.exception()
+            if e is not None:
+                raise gr.Error(f'The following error happened during generation: {repr(e)}')
+            else:
+                raise gr.Error(f'Generation timed out (no new tokens were generated after {TIMEOUT} s)')
+
+
+    # Update the chatbot with the real conversation (which may be slightly different due to postprocessing)
+    yield conversation, conversation.to_gradio_format()
+
+
+
+def simple_authentication(credentials_file: str, username: str, password: str) -> bool:
+    """Simple authentication method.
+
+    Parameters
+    ----------
+    credentials_file : str
+        Path to the credentials.
+    username : str
+        The username provided.
+    password : str
+        The password provided.
+
+    Returns
+    -------
+    bool
+        Return True if both the username and password match some credentials stored in `credentials_file`,
+        False otherwise.
+    """
+
+    with open(credentials_file, 'r') as file:
+        # Read lines and remove whitespaces
+        lines = [line.strip() for line in file.readlines() if line.strip() != '']
+
+    valid_usernames = lines[0::2]
+    valid_passwords = lines[1::2]
+
+    if username in valid_usernames:
+        index = valid_usernames.index(username)
+        # Check that the password also matches at the corresponding index
+        if password == valid_passwords[index]:
+            return True
+
+    return False
\ No newline at end of file