diff --git a/textwiz/__init__.py b/textwiz/__init__.py
index 902aedf..02615a4 100644
--- a/textwiz/__init__.py
+++ b/textwiz/__init__.py
@@ -8,11 +8,11 @@ from .streamer import TextContinuationStreamer
 # import it here so that the warnings are suppressed when doing `import textwiz`
 from . import warnings_suppressor
 
-# also import some of the submodules for convenience
+# also directly import some of the submodules for convenience and auto-complete
 from . import loader, conversation_template, prompt_template
 
 
-__version__ = '0.0.7'
+__version__ = '0.1.0'
 
 
 def is_chat_model(model_name: str) -> bool:
@@ -34,3 +34,85 @@ def is_chat_model(model_name: str) -> bool:
 
     template = get_prompt_template(model_name)
     return template.default_mode == 'chat'
+
+
+def estimate_number_of_gpus(models: list[str], quantization_8bits: bool = False, quantization_4bits: bool = False,
+                            max_fraction_gpu_0: float = 0.8, max_fraction_gpus: float = 0.8) -> list[int]:
+    """Estimate the number of gpus needed to run each of the `models` correctly.
+
+    Parameters
+    ----------
+    models : list[str]
+        The models.
+    quantization_8bits : bool
+        Whether the model will be loaded in 8 bits mode, by default False.
+    quantization_4bits : bool
+        Whether the model will be loaded in 4 bits mode, by default False.
+    max_fraction_gpu_0 : float, optional
+        The maximum fraction of the gpu 0 memory to reserve for the model. The default is 0.8.
+    max_fraction_gpus : float, optional
+        The maximum fraction of the other gpus memory to reserve for the model. The default is 0.8.
+
+    Returns
+    -------
+    list[int]
+        The number of gpus for each model.
+    """
+
+    model_footprints = []
+    for model in models:
+        # Override quantization for bloom because it's too big to load in float16
+        if model == 'bloom-176B' and not (quantization_8bits or quantization_4bits):
+            gpu_needed, _ = estimate_model_gpu_footprint(model, quantization_8bits=True, quantization_4bits=False,
+                                                         max_fraction_gpu_0=0.9, max_fraction_gpus=0.9)
+        else:
+            gpu_needed, _ = estimate_model_gpu_footprint(model, quantization_8bits=quantization_8bits,
+                                                         quantization_4bits=quantization_4bits,
+                                                         max_fraction_gpu_0=max_fraction_gpu_0,
+                                                         max_fraction_gpus=max_fraction_gpus)
+        model_footprints.append(gpu_needed)
+
+    return model_footprints
+
+
+
+# Relatively small models (they should fit on a single A100 40GB GPU)
+SMALL_MODELS = tuple(model for model in loader.ALLOWED_MODELS if loader.ALL_MODELS_PARAMS[model] <= 16)
+# Large models (they require more than 1 A100 40GB GPU)
+LARGE_MODELS = tuple(model for model in loader.ALLOWED_MODELS if loader.ALL_MODELS_PARAMS[model] > 16)
+
+assert set(loader.ALLOWED_MODELS) == set(SMALL_MODELS + LARGE_MODELS), 'We are somehow missing some models...'
+
+
+# Models with a non-default prompt template
+SMALL_MODELS_SPECIAL_PROMPT = tuple(model for model in SMALL_MODELS if model in prompt_template.PROMPT_MAPPING.keys())
+LARGE_MODELS_SPECIAL_PROMPT = tuple(model for model in LARGE_MODELS if model in prompt_template.PROMPT_MAPPING.keys())
+
+
+
+# Models that we decided to keep for further code benchmarks
+GOOD_CODERS = (
+    'star-coder-base',
+    'star-coder',
+    'star-chat-alpha',
+    'codegen-16B',
+    'codegen25-7B',
+    'codegen25-7B-instruct',
+    'code-llama-34B',
+    'code-llama-34B-python',
+    'code-llama-34B-instruct',
+    'llama2-70B',
+    'llama2-70B-chat',
+)
+
+
+SMALL_GOOD_CODERS = tuple(model for model in GOOD_CODERS if model in SMALL_MODELS)
+LARGE_GOOD_CODERS = tuple(model for model in GOOD_CODERS if model in LARGE_MODELS)
+
+
+assert set(GOOD_CODERS) == set(SMALL_GOOD_CODERS + LARGE_GOOD_CODERS), 'We are somehow missing some good coder models...'
+
+
+# Models that we decided to keep for further code benchmarks with a non-default prompt template
+SMALL_GOOD_CODERS_SPECIAL_PROMPT = tuple(model for model in SMALL_GOOD_CODERS if model in prompt_template.PROMPT_MAPPING.keys())
+LARGE_GOOD_CODERS_SPECIAL_PROMPT = tuple(model for model in LARGE_GOOD_CODERS if model in prompt_template.PROMPT_MAPPING.keys())
\ No newline at end of file
diff --git a/textwiz/web_interface.py b/textwiz/web_interface.py
new file mode 100644
index 0000000..3fef1fe
--- /dev/null
+++ b/textwiz/web_interface.py
@@ -0,0 +1,370 @@
+"""This module provides convenience functions to use when creating a Gradio web app."""
+import queue
+import copy
+from concurrent.futures import ThreadPoolExecutor
+
+from transformers import TextIteratorStreamer
+import gradio as gr
+
+from .generation import HFModel
+from .streamer import TextContinuationStreamer
+from .conversation_template import GenericConversation
+
+
+TIMEOUT = 20
+
+
+def text_generation(model: HFModel, prompt: str, max_new_tokens: int, do_sample: bool, top_k: int, top_p: float,
+                    temperature: float, use_seed: bool, seed: int, **kwargs) -> str:
+    """Text generation with `model`. This is a generator yielding tokens as they are generated.
+
+    Parameters
+    ----------
+    model : HFModel
+        The model to use for generation.
+    prompt : str
+        The prompt to the model.
+    max_new_tokens : int
+        How many new tokens to generate.
+    do_sample : bool
+        Whether to introduce randomness in the generation.
+    top_k : int
+        How many tokens with max probability to consider for randomness.
+    top_p : float
+        The probability density covering the new tokens to consider for randomness.
+    temperature : float
+        How to cool down the probability distribution. Value between 1 (no cooldown) and 0 (greedy search,
+        no randomness).
+    use_seed : bool
+        Whether to use a fixed seed for reproducibility.
+    seed : int
+        An optional seed to force the generation to be reproducible.
+
+    Yields
+    ------
+    Iterator[str]
+        String containing the sequence generated.
+    """
+
+    if not use_seed:
+        seed = None
+
+    # To show text as it is being generated
+    streamer = TextIteratorStreamer(model.tokenizer, skip_prompt=True, timeout=TIMEOUT, skip_special_tokens=True)
+
+    # We need to launch a new thread to get text from the streamer in real-time as it is being generated. We
+    # use an executor because it makes it easier to catch possible exceptions
+    with ThreadPoolExecutor(max_workers=1) as executor:
+        future = executor.submit(model.generate_text, prompt, max_new_tokens=max_new_tokens, do_sample=do_sample,
+                                 top_k=top_k, top_p=top_p, temperature=temperature, seed=seed,
+                                 truncate_prompt_from_output=True, streamer=streamer, **kwargs)
+
+        # Get results from the streamer and yield them
+        try:
+            # Ask the streamer to skip prompt and reattach it here to avoid showing special prompt formatting
+            generated_text = prompt
+            for new_text in streamer:
+                generated_text += new_text
+                yield generated_text
+
+        # If for some reason the queue (from the streamer) is still empty after timeout, we probably
+        # encountered an exception
+        except queue.Empty:
+            e = future.exception()
+            if e is not None:
+                raise gr.Error(f'The following error happened during generation: {repr(e)}')
+            else:
+                raise gr.Error(f'Generation timed out (no new tokens were generated after {TIMEOUT} s)')
+
+        # Get actual result and yield it (which may be slightly different due to postprocessing)
+        generated_text = future.result()
+        yield prompt + generated_text
+
+
+
+def chat_generation(model: HFModel, conversation: GenericConversation, prompt: str, max_new_tokens: int,
+                    do_sample: bool, top_k: int, top_p: float, temperature: float, use_seed: bool, seed: int,
+                    **kwargs) -> tuple[str, GenericConversation, list[list]]:
+    """Chat generation with `model`. This is a generator yielding tokens as they are generated.
+
+    Parameters
+    ----------
+    model : HFModel
+        The model to use for generation.
+    conversation : GenericConversation
+        Current conversation. This is the value inside a gr.State instance.
+    prompt : str
+        The prompt to the model.
+    max_new_tokens : int
+        How many new tokens to generate.
+    do_sample : bool
+        Whether to introduce randomness in the generation.
+    top_k : int
+        How many tokens with max probability to consider for randomness.
+    top_p : float
+        The probability density covering the new tokens to consider for randomness.
+    temperature : float
+        How to cool down the probability distribution. Value between 1 (no cooldown) and 0 (greedy search,
+        no randomness).
+    use_seed : bool
+        Whether to use a fixed seed for reproducibility.
+    seed : int
+        An optional seed to force the generation to be reproducible.
+
+    Yields
+    ------
+    Iterator[tuple[str, GenericConversation, list[list]]]
+        Corresponds to the gradio components (prompt, conversation, chatbot).
+    """
+
+    if not use_seed:
+        seed = None
+
+    # To show text as it is being generated
+    streamer = TextIteratorStreamer(model.tokenizer, skip_prompt=True, timeout=TIMEOUT, skip_special_tokens=True)
+
+    conv_copy = copy.deepcopy(conversation)
+    conv_copy.append_user_message(prompt)
+
+    # We need to launch a new thread to get text from the streamer in real-time as it is being generated. We
+    # use an executor because it makes it easier to catch possible exceptions
+    with ThreadPoolExecutor(max_workers=1) as executor:
+        # This will update `conversation` in-place
+        future = executor.submit(model.generate_conversation, prompt, conv_history=conversation,
+                                 max_new_tokens=max_new_tokens, do_sample=do_sample, top_k=top_k, top_p=top_p,
+                                 temperature=temperature, seed=seed, truncate_if_conv_too_long=True,
+                                 streamer=streamer, **kwargs)
+
+        # Get results from the streamer and yield them
+        try:
+            generated_text = ''
+            for new_text in streamer:
+                generated_text += new_text
+                # Update model answer (on a copy of the conversation) as it is being generated
+                conv_copy.model_history_text[-1] = generated_text
+                # The first output is an empty string to clear the input box, the second is the conversation
+                # state, and the third is the formatted output to use in a gradio chatbot component
+                yield '', conv_copy, conv_copy.to_gradio_format()
+
+        # If for some reason the queue (from the streamer) is still empty after timeout, we probably
+        # encountered an exception
+        except queue.Empty:
+            e = future.exception()
+            if e is not None:
+                raise gr.Error(f'The following error happened during generation: {repr(e)}')
+            else:
+                raise gr.Error(f'Generation timed out (no new tokens were generated after {TIMEOUT} s)')
+
+    # Update the chatbot with the real conversation (which may be slightly different due to postprocessing)
+    yield '', conversation, conversation.to_gradio_format()
+
+
+
+def continue_generation(model: HFModel, conversation: GenericConversation, additional_max_new_tokens: int,
+                        do_sample: bool, top_k: int, top_p: float, temperature: float, use_seed: bool,
+                        seed: int, **kwargs) -> tuple[GenericConversation, list[list]]:
+    """Continue the last turn of the `model` output. This is a generator yielding tokens as they are generated.
+
+    Parameters
+    ----------
+    model : HFModel
+        The model to use for generation.
+    conversation : GenericConversation
+        Current conversation. This is the value inside a gr.State instance.
+    additional_max_new_tokens : int
+        How many new tokens to generate.
+    do_sample : bool
+        Whether to introduce randomness in the generation.
+    top_k : int
+        How many tokens with max probability to consider for randomness.
+    top_p : float
+        The probability density covering the new tokens to consider for randomness.
+    temperature : float
+        How to cool down the probability distribution. Value between 1 (no cooldown) and 0 (greedy search,
+        no randomness).
+    use_seed : bool
+        Whether to use a fixed seed for reproducibility.
+    seed : int
+        An optional seed to force the generation to be reproducible.
+
+    Yields
+    ------
+    Iterator[tuple[GenericConversation, list[list]]]
+        Corresponds to the gradio components (conversation, chatbot).
+    """
+
+    if len(conversation) == 0:
+        gr.Warning('You cannot continue an empty conversation.')
+        yield conversation, conversation.to_gradio_format()
+        return
+    if conversation.model_history_text[-1] is None:
+        gr.Warning('You cannot continue an empty last turn.')
+        yield conversation, conversation.to_gradio_format()
+        return
+
+    if not use_seed:
+        seed = None
+
+    # To show text as it is being generated
+    streamer = TextContinuationStreamer(model.tokenizer, skip_prompt=True, timeout=TIMEOUT, skip_special_tokens=True)
+
+    conv_copy = copy.deepcopy(conversation)
+
+    # We need to launch a new thread to get text from the streamer in real-time as it is being generated. We
+    # use an executor because it makes it easier to catch possible exceptions
+    with ThreadPoolExecutor(max_workers=1) as executor:
+        # This will update `conversation` in-place
+        future = executor.submit(model.continue_last_conversation_turn, conv_history=conversation,
+                                 max_new_tokens=additional_max_new_tokens, do_sample=do_sample, top_k=top_k, top_p=top_p,
+                                 temperature=temperature, seed=seed, truncate_if_conv_too_long=True, streamer=streamer,
+                                 **kwargs)
+
+        # Get results from the streamer and yield them
+        try:
+            generated_text = conv_copy.model_history_text[-1]
+            for new_text in streamer:
+                generated_text += new_text
+                # Update model answer (on a copy of the conversation) as it is being generated
+                conv_copy.model_history_text[-1] = generated_text
+                # The first output is the conversation state, the second is the formatted output to use in a
+                # gradio chatbot component
+                yield conv_copy, conv_copy.to_gradio_format()
+
+        # If for some reason the queue (from the streamer) is still empty after timeout, we probably
+        # encountered an exception
+        except queue.Empty:
+            e = future.exception()
+            if e is not None:
+                raise gr.Error(f'The following error happened during generation: {repr(e)}')
+            else:
+                raise gr.Error(f'Generation timed out (no new tokens were generated after {TIMEOUT} s)')
+
+    # Update the chatbot with the real conversation (which may be slightly different due to postprocessing)
+    yield conversation, conversation.to_gradio_format()
+
+
+
+def retry_chat_generation(model: HFModel, conversation: GenericConversation, max_new_tokens: int, do_sample: bool,
+                          top_k: int, top_p: float, temperature: float, use_seed: bool,
+                          seed: int, **kwargs) -> tuple[GenericConversation, list[list]]:
+    """Regenerate the last turn of the conversation. This is a generator yielding tokens as they are generated.
+
+    Parameters
+    ----------
+    model : HFModel
+        The model to use for generation.
+    conversation : GenericConversation
+        Current conversation. This is the value inside a gr.State instance.
+    max_new_tokens : int
+        How many new tokens to generate.
+    do_sample : bool
+        Whether to introduce randomness in the generation.
+    top_k : int
+        How many tokens with max probability to consider for randomness.
+    top_p : float
+        The probability density covering the new tokens to consider for randomness.
+    temperature : float
+        How to cool down the probability distribution. Value between 1 (no cooldown) and 0 (greedy search,
+        no randomness).
+    use_seed : bool
+        Whether to use a fixed seed for reproducibility.
+    seed : int
+        An optional seed to force the generation to be reproducible.
+
+    Yields
+    ------
+    Iterator[tuple[GenericConversation, list[list]]]
+        Corresponds to the gradio components (conversation, chatbot).
+    """
+
+    if len(conversation) == 0:
+        gr.Warning('You cannot retry generation on an empty conversation.')
+        yield conversation, conversation.to_gradio_format()
+        return
+    if conversation.model_history_text[-1] is None:
+        gr.Warning('You cannot retry generation on an empty last turn.')
+        yield conversation, conversation.to_gradio_format()
+        return
+
+    if not use_seed:
+        seed = None
+
+    # Remove last turn
+    prompt = conversation.user_history_text[-1]
+    _ = conversation.user_history_text.pop(-1)
+    _ = conversation.model_history_text.pop(-1)
+
+    # To show text as it is being generated
+    streamer = TextIteratorStreamer(model.tokenizer, skip_prompt=True, timeout=TIMEOUT, skip_special_tokens=True)
+
+    conv_copy = copy.deepcopy(conversation)
+    conv_copy.append_user_message(prompt)
+
+    # We need to launch a new thread to get text from the streamer in real-time as it is being generated. We
+    # use an executor because it makes it easier to catch possible exceptions
+    with ThreadPoolExecutor(max_workers=1) as executor:
+        # This will update `conversation` in-place
+        future = executor.submit(model.generate_conversation, prompt, system_prompt=None, conv_history=conversation,
+                                 max_new_tokens=max_new_tokens, do_sample=do_sample, top_k=top_k, top_p=top_p,
+                                 temperature=temperature, seed=seed, truncate_if_conv_too_long=True, streamer=streamer,
+                                 **kwargs)
+
+        # Get results from the streamer and yield them
+        try:
+            generated_text = ''
+            for new_text in streamer:
+                generated_text += new_text
+                # Update model answer (on a copy of the conversation) as it is being generated
+                conv_copy.model_history_text[-1] = generated_text
+                # The first output is the conversation state, the second is the formatted output to use in a
+                # gradio chatbot component
+                yield conv_copy, conv_copy.to_gradio_format()
+
+        # If for some reason the queue (from the streamer) is still empty after timeout, we probably
+        # encountered an exception
+        except queue.Empty:
+            e = future.exception()
+            if e is not None:
+                raise gr.Error(f'The following error happened during generation: {repr(e)}')
+            else:
+                raise gr.Error(f'Generation timed out (no new tokens were generated after {TIMEOUT} s)')
+
+
+    # Update the chatbot with the real conversation (which may be slightly different due to postprocessing)
+    yield conversation, conversation.to_gradio_format()
+
+
+
+def simple_authentication(credentials_file: str, username: str, password: str) -> bool:
+    """Simple authentication method.
+
+    Parameters
+    ----------
+    credentials_file : str
+        Path to the credentials.
+    username : str
+        The username provided.
+    password : str
+        The password provided.
+
+    Returns
+    -------
+    bool
+        Return True if both the username and password match some credentials stored in `credentials_file`,
+        False otherwise.
+    """
+
+    with open(credentials_file, 'r') as file:
+        # Read lines and remove whitespaces
+        lines = [line.strip() for line in file.readlines() if line.strip() != '']
+
+    valid_usernames = lines[0::2]
+    valid_passwords = lines[1::2]
+
+    if username in valid_usernames:
+        index = valid_usernames.index(username)
+        # Check that the password also matches at the corresponding index
+        if password == valid_passwords[index]:
+            return True
+
+    return False
\ No newline at end of file