From 878dd333f2041180a909ac2c350409589cf3055d Mon Sep 17 00:00:00 2001
From: Aryan Pandey
Date: Tue, 31 Oct 2023 03:52:50 +0530
Subject: [PATCH 1/5] Adding Vertex AI (Google) API based fine-tuning

---
 .../Vertex_AI_Google_API_based_fine_tuning.py | 75 +++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 src/llm_vm/Vertex_AI_Google_API_based_fine_tuning.py

diff --git a/src/llm_vm/Vertex_AI_Google_API_based_fine_tuning.py b/src/llm_vm/Vertex_AI_Google_API_based_fine_tuning.py
new file mode 100644
index 00000000..2e209e7a
--- /dev/null
+++ b/src/llm_vm/Vertex_AI_Google_API_based_fine_tuning.py
@@ -0,0 +1,75 @@
+from __future__ import annotations
+
+
+from typing import Optional
+
+
+from google.auth import default
+from google.cloud import aiplatform
+import pandas as pd
+import vertexai
+from vertexai.language_models import TextGenerationModel
+from vertexai.preview.language_models import TuningEvaluationSpec
+
+
+credentials, _ = default(scopes=["https://www.googleapis.com/auth/cloud-platform"])
+
+
+def tuning(
+    project_id: str,
+    location: str,
+    model_display_name: str,
+    training_data: pd.DataFrame | str,
+    train_steps: int = 10,
+    evaluation_dataset: Optional[str] = None,
+    tensorboard_instance_name: Optional[str] = None,
+) -> TextGenerationModel:
+    """Tune a new model, based on prompt-response data.
+
+    "training_data" can be either the GCS URI of a file in JSONL format
+    (for example: training_data=f'gs://{bucket}/{filename}.jsonl'), or a pandas
+    DataFrame. Each training example should be a JSONL record with two keys, for
+    example:
+        {
+            "input_text": <input prompt>,
+            "output_text": <associated output>
+        },
+    or the pandas DataFrame should contain two columns:
+        ['input_text', 'output_text']
+    with one row per training example.
+
+    Args:
+        project_id: GCP Project ID, used to initialize vertexai
+        location: GCP Region, used to initialize vertexai
+        model_display_name: Customized Tuned LLM model name.
+        training_data: GCS URI of jsonl file or pandas dataframe of training data.
+        train_steps: Number of training steps to use when tuning the model.
+        evaluation_dataset: Optional GCS URI of jsonl file of evaluation data.
+        tensorboard_instance_name: Optional full name of an existing Vertex AI TensorBoard instance:
+          projects/PROJECT_ID/locations/LOCATION_ID/tensorboards/TENSORBOARD_INSTANCE_ID
+          Note that this instance must be in the same region as your tuning job.
+    """
+    vertexai.init(project=project_id, location=location, credentials=credentials)
+
+    # Build the evaluation spec only when evaluation data is supplied, since
+    # both evaluation arguments are optional.
+    eval_spec = None
+    if evaluation_dataset:
+        eval_spec = TuningEvaluationSpec(evaluation_data=evaluation_dataset)
+        if tensorboard_instance_name:
+            eval_spec.tensorboard = aiplatform.Tensorboard(
+                tensorboard_name=tensorboard_instance_name
+            )
+
+    model = TextGenerationModel.from_pretrained("text-bison@001")
+
+    model.tune_model(
+        training_data=training_data,
+        # Optional:
+        model_display_name=model_display_name,
+        train_steps=train_steps,
+        tuning_job_location="europe-west4",
+        tuned_model_location=location,
+        tuning_evaluation_spec=eval_spec,
+    )
+
+    print(model._job.status)
+
+    return model
+
+
+if __name__ == "__main__":
+    # Placeholder values; replace with your own project, region, model name, and data.
+    tuning(
+        project_id="your-project-id",
+        location="us-central1",
+        model_display_name="my-tuned-model",
+        training_data="gs://your-bucket/train.jsonl",
+    )
\ No newline at end of file

From d764e11dd2da3b7e62443b0210996156fca9948e Mon Sep 17 00:00:00 2001
From: Aryan Pandey
Date: Wed, 1 Nov 2023 05:35:27 +0530
Subject: [PATCH 2/5] Adding Glossary list #258

---
 glossary.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/glossary.md b/glossary.md
index 7dc11ab9..d31ec7cf 100644
--- a/glossary.md
+++ b/glossary.md
@@ -50,3 +50,21 @@ It makes neural network models more memory-efficient and computationally faster
 3. The model's tendency to generate creative but incorrect information.
 
 Researchers and developers continually work to mitigate and reduce hallucination in LLMs to make them more reliable and trustworthy in their outputs.
+
+## References
+- ["Large Language model"](https://www.analyticsvidhya.com/blog/2023/03/an-introduction-to-large-language-models-llms/)
+
+- ["Transformer architecture"](https://machinelearningmastery.com/the-transformer-model/)
+
+- ["Tokenization"](https://www.analyticsvidhya.com/blog/2020/05/what-is-tokenization-nlp/)
+
+- ["Embedding Layer"](https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/)
+
+- ["Prompt"](https://machinelearningmastery.com/a-gentle-introduction-to-prompt-engineering/)
+
+- ["Transfer learning"](https://machinelearningmastery.com/transfer-learning-for-deep-learning/)
+
+- ["Distributed learning"](https://cs.brown.edu/people/acrotty/pubs/Galakatos2017_ReferenceWorkEntry_DistributedMachineLearning.pdf)
+
+- ["Fine-tuning"](https://intellipaat.com/blog/fine-tuning/)
+- 
\ No newline at end of file

From addef15124cbe6a20556d2ccc4c9a91e072385cb Mon Sep 17 00:00:00 2001
From: Aryan Pandey
Date: Wed, 1 Nov 2023 17:21:09 +0530
Subject: [PATCH 3/5] Adding Load-balancing / auto-scaling for LLM serving on Google Cloud

---
 my-llm-deployment.yaml | 17 +++++++++++++++++
 my-llm-hpa.yaml        | 16 ++++++++++++++++
 my-llm-service.yaml    | 12 ++++++++++++
 3 files changed, 45 insertions(+)
 create mode 100644 my-llm-deployment.yaml
 create mode 100644 my-llm-hpa.yaml
 create mode 100644 my-llm-service.yaml

diff --git a/my-llm-deployment.yaml b/my-llm-deployment.yaml
new file mode 100644
index 00000000..774f64a9
--- /dev/null
+++ b/my-llm-deployment.yaml
@@ -0,0 +1,17 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: my-llm-deployment
+spec:
+  replicas: 3
+  selector:
+    matchLabels:
+      app: my-llm-service
+  template:
+    metadata:
+      labels:
+        app: my-llm-service
+    spec:
+      containers:
+      - name: my-llm-container
+        image: gcr.io/your-project/llm-image:tag
diff --git a/my-llm-hpa.yaml b/my-llm-hpa.yaml
new file mode 100644
index 00000000..7ddf19fb
--- /dev/null
+++ b/my-llm-hpa.yaml
@@ -0,0 +1,16 @@
+apiVersion: autoscaling/v2beta2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: my-llm-hpa
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: my-llm-deployment
+  minReplicas: 1
+  maxReplicas: 10
+  metrics:
+  - type: Resource
+    resource:
+      name: cpu
+      targetAverageUtilization: 50
diff --git a/my-llm-service.yaml b/my-llm-service.yaml
new file mode 100644
index 00000000..84433a8c
--- /dev/null
+++ b/my-llm-service.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: my-llm-service
+spec:
+  selector:
+    app: my-llm-service
+  ports:
+  - protocol: TCP
+    port: 80
+    targetPort: 80
+  type: LoadBalancer

From ecab2a2ac2f382b9f4d547294dbfe6b3070e3cc0 Mon Sep 17 00:00:00 2001
From: Aryan Pandey <92007507+Aryan8912@users.noreply.github.com>
Date: Thu, 30 Nov 2023 05:14:55 +0530
Subject: [PATCH 4/5] Delete glossary.md

---
 glossary.md | 70 -----------------------------------------------------
 1 file changed, 70 deletions(-)
 delete mode 100644 glossary.md

diff --git a/glossary.md b/glossary.md
deleted file mode 100644
index d31ec7cf..00000000
--- a/glossary.md
+++ /dev/null
@@ -1,70 +0,0 @@
-# Glossary of Anarchy
-
-## Large Language Models
-
-Large Language Models, often referred to as "LLMs," are a class of artificial intelligence models designed for natural language understanding and generation tasks. These models are characterized by their massive size, extensive training data, and deep neural network architectures. LLMs have gained prominence in recent years due to their ability to generate coherent and contextually relevant text across a wide range of applications.
-
-**Transformer architecture:** a deep learning framework introduced in the paper "Attention Is All You Need." It excels in processing sequential data, such as text, by using a self-attention mechanism to capture dependencies and relationships between elements in a sequence. The architecture employs multi-head attention, allowing it to focus on different aspects of the input simultaneously. To handle the sequence's positional information, positional encodings are added. The Transformer has revolutionized natural language processing and machine translation due to its ability to efficiently model context and dependencies, making it a foundational component of many state-of-the-art language models and NLP applications.
-
-**Attention mechanism:** a fundamental component in deep learning models, particularly in the context of natural language processing (NLP) and computer vision. It enables models to selectively focus on specific parts of input data when making predictions or generating output. This mechanism allows the model to assign varying levels of importance to different elements in a sequence, considering their relevance in the context of the task. By capturing dependencies and context, the attention mechanism has significantly improved the performance of various machine learning applications, including machine translation, text summarization, image captioning, and more.
-
-**Tokenization:** a fundamental natural language processing (NLP) technique that involves breaking down a text into smaller units called "tokens." In the context of language processing, tokens can be words, subwords, or even individual characters, depending on the granularity chosen for analysis. Tokenization is the initial step in many NLP tasks and plays a crucial role in transforming unstructured text data into a format that can be processed by machine learning models. It helps to segment text into meaningful units, making it easier to analyze and manipulate, whether for tasks like text classification, sentiment analysis, machine translation, or language modeling. (One token is roughly 3/4 of a word, or about 4 characters.)
-
-**Embedding Layer:** a fundamental component of many natural language processing (NLP) and deep learning models, especially those used for text analysis. It is a layer in a neural network that is responsible for converting discrete data, such as words or tokens, into fixed-size continuous vector representations, also known as word embeddings or token embeddings. Words with similar meanings or related meanings are represented close to each other in the vector space, allowing the model to capture semantic relationships.
-
-**Prompt:** refers to the input or query provided to the model to elicit a response or generate text. The prompt is a text-based instruction or question that serves as a starting point for the LLM to generate a coherent and contextually relevant response.
-
-**Transfer learning** is a machine learning technique that reuses a pretrained model's knowledge on one task to enhance performance on a related task. It's a time-efficient way to improve models, particularly when data for the target task is limited.
-
-**Knowledge distillation:** a process where a smaller or more efficient model (the "student") is trained to replicate the behavior and predictions of a larger, more complex model (the "teacher"). Advantage: more computationally efficient and suitable for deployment in resource-constrained environments.
-
-**Quantization:** a process of reducing the precision or bit-width of numerical values, typically weights and activations, used in a neural network model. This involves representing the original high-precision floating-point values with a limited set of discrete values, often integers.
-It makes neural network models more memory-efficient and computationally faster during inference, without significantly compromising their performance. Quantization is particularly valuable for deploying models on resource-constrained devices like mobile phones, IoT devices, and edge computing devices.
-
-**Distributed learning**, also known as distributed machine learning, refers to the practice of training machine learning models across multiple computing devices or nodes in a distributed computing environment. The goal of distributed learning is to leverage the computational power and resources of multiple machines to accelerate the training process and handle larger datasets and more complex models.
-
-**Fine-tuning:** the process of adapting a pre-trained LLM to perform specific natural language processing (NLP) tasks or to generate text that aligns with particular criteria. It involves taking a well-established and pre-trained LLM, such as GPT-3 or BERT, and updating its parameters by training it on a smaller, task-specific dataset. Fine-tuning enables LLMs to excel in various NLP applications, including text classification, language translation, text summarization, and chatbot responses, by tailoring their capabilities to meet the requirements of a specific task or domain. This technique is especially valuable when you have limited labeled data for a particular application, as it leverages the pre-trained model's general language understanding while adapting it to the specifics of the target task.
-
-**Pruning:** a technique used to remove certain connections (weights) or neurons from a neural network while retaining its general structure and functionality. These connections or neurons are often identified based on their low importance or contribution to the network's overall performance.
-
-- **Purpose:** The main purpose of pruning is to reduce the size of a neural network model, thereby decreasing memory usage and computational requirements during inference. Smaller models are more efficient and suitable for deployment on resource-constrained devices.
-
-- **Methods:** Various methods and criteria can be used for pruning, including magnitude-based pruning (removing small-weight connections), sensitivity analysis, and iterative pruning. Pruning can be applied to different layers or parts of a neural network, depending on the specific goals of model compression and optimization.
-
-**Sparsity:** refers to the property of having many of the elements or parameters in a neural network set to zero. A sparse neural network contains a significant portion of zero-valued parameters, resulting in a more compact representation.
-
-- **Purpose:** Introducing sparsity into a neural network is often an outcome of pruning, but it can also be achieved through other techniques. Sparse models consume less memory and require fewer computations, making them suitable for deployment in scenarios where computational resources are limited.
-
-- **Benefits:** Sparse neural networks can have improved inference speed, reduced memory footprint, and decreased energy consumption, which is advantageous for edge devices, mobile applications, and real-time systems.
-
-- **Sparsity Techniques:** Besides pruning, techniques like weight regularization with sparsity-inducing penalties (e.g., L1 regularization), structured sparsity, and quantization can be used to induce sparsity in neural network models.
-
-### Hallucination
-
-**Hallucination:** A phenomenon where the model generates text or responses that contain information or details that are not based on factual data or reality. It occurs when the model produces information that is fabricated, imagined, or fictional rather than being grounded in accurate information from its training data or the real world.
-
-**Causes of Hallucination:**
-
-1. Biases in the training data.
-2. Limitations in the model's understanding of context.
-3. The model's tendency to generate creative but incorrect information.
-
-Researchers and developers continually work to mitigate and reduce hallucination in LLMs to make them more reliable and trustworthy in their outputs.
-
-## References
-- ["Large Language model"](https://www.analyticsvidhya.com/blog/2023/03/an-introduction-to-large-language-models-llms/)
-
-- ["Transformer architecture"](https://machinelearningmastery.com/the-transformer-model/)
-
-- ["Tokenization"](https://www.analyticsvidhya.com/blog/2020/05/what-is-tokenization-nlp/)
-
-- ["Embedding Layer"](https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/)
-
-- ["Prompt"](https://machinelearningmastery.com/a-gentle-introduction-to-prompt-engineering/)
-
-- ["Transfer learning"](https://machinelearningmastery.com/transfer-learning-for-deep-learning/)
-
-- ["Distributed learning"](https://cs.brown.edu/people/acrotty/pubs/Galakatos2017_ReferenceWorkEntry_DistributedMachineLearning.pdf)
-
-- ["Fine-tuning"](https://intellipaat.com/blog/fine-tuning/)
-- 
\ No newline at end of file

From 90d6079c70432651715e528e42b89fa9c1c9d8e6 Mon Sep 17 00:00:00 2001
From: Aryan Pandey <92007507+Aryan8912@users.noreply.github.com>
Date: Thu, 30 Nov 2023 05:15:28 +0530
Subject: [PATCH 5/5] Delete src/llm_vm/Vertex_AI_Google_API_based_fine_tuning.py

---
 .../Vertex_AI_Google_API_based_fine_tuning.py | 75 -------------------
 1 file changed, 75 deletions(-)
 delete mode 100644 src/llm_vm/Vertex_AI_Google_API_based_fine_tuning.py

diff --git a/src/llm_vm/Vertex_AI_Google_API_based_fine_tuning.py b/src/llm_vm/Vertex_AI_Google_API_based_fine_tuning.py
deleted file mode 100644
index 2e209e7a..00000000
--- a/src/llm_vm/Vertex_AI_Google_API_based_fine_tuning.py
+++ /dev/null
@@ -1,75 +0,0 @@
-from __future__ import annotations
-
-
-from typing import Optional
-
-
-from google.auth import default
-from google.cloud import aiplatform
-import pandas as pd
-import vertexai
-from vertexai.language_models import TextGenerationModel
-from vertexai.preview.language_models import TuningEvaluationSpec
-
-
-credentials, _ = default(scopes=["https://www.googleapis.com/auth/cloud-platform"])
-
-
-def tuning(
-    project_id: str,
-    location: str,
-    model_display_name: str,
-    training_data: pd.DataFrame | str,
-    train_steps: int = 10,
-    evaluation_dataset: Optional[str] = None,
-    tensorboard_instance_name: Optional[str] = None,
- """Tune a new model, based on a prompt-response data. - - "training_data" can be either the GCS URI of a file formatted in JSONL format - (for example: training_data=f'gs://{bucket}/{filename}.jsonl'), or a pandas - DataFrame. Each training example should be JSONL record with two keys, for - example: - { - "input_text": , - "output_text": - }, - or the pandas DataFame should contain two columns: - ['input_text', 'output_text'] - with rows for each training example. - - Args: - project_id: GCP Project ID, used to initialize vertexai - location: GCP Region, used to initialize vertexai - model_display_name: Customized Tuned LLM model name. - training_data: GCS URI of jsonl file or pandas dataframe of training data. - train_steps: Number of training steps to use when tuning the model. - evaluation_dataset: GCS URI of jsonl file of evaluation data. - tensorboard_instance_name: The full name of the existing Vertex AI TensorBoard instance: - projects/PROJECT_ID/locations/LOCATION_ID/tensorboards/TENSORBOARD_INSTANCE_ID - Note that this instance must be in the same region as your tuning job. - """ - vertexai.init(project=project_id, location=location, credentials=credentials) - eval_spec = TuningEvaluationSpec(evaluation_data=evaluation_dataset) - eval_spec.tensorboard = aiplatform.Tensorboard( - tensorboard_name=tensorboard_instance_name - ) - model = TextGenerationModel.from_pretrained("text-bison@001") - - model.tune_model( - training_data=training_data, - # Optional: - model_display_name=model_display_name, - train_steps=train_steps, - tuning_job_location="europe-west4", - tuned_model_location=location, - tuning_evaluation_spec=eval_spec, - ) - - print(model._job.status) - - return model - - -if __name__ == "__main__": - tuning() \ No newline at end of file