Glove cpu (#57)
boranhan authored Oct 25, 2024
1 parent 1d4d0bd commit 8ecdde6
Showing 3 changed files with 45 additions and 15 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
     "python-calamine",
     "sentence-transformers>=3.1.0",
     "tenacity>=8.2.2,<10.0",
+    "gensim>=4.3",
 ]

 [project.optional-dependencies]
4 changes: 2 additions & 2 deletions src/autogluon_assistant/task_inference/task_inference.py
@@ -60,9 +60,9 @@ def _chat_and_parse_prompt_output(self) -> Dict[str, str]:
         """Chat with the LLM and parse the output"""
         try:
             chat_prompt = self.prompt_generator.generate_chat_prompt()
-            logger.info(f"LLM chat_prompt:\n{chat_prompt.format_messages()}")
+            logger.debug(f"LLM chat_prompt:\n{chat_prompt.format_messages()}")
             output = self.llm.invoke(chat_prompt.format_messages())
-            logger.info(f"LLM output:\n{output}")
+            logger.debug(f"LLM output:\n{output}")
             parsed_output = self.parse_output(output)
         except OutputParserException as e:
             logger.error(f"Failed to parse output: {e}")
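Note: the two logging calls above are demoted from logger.info to logger.debug, so the raw LLM prompt and LLM output no longer show up at the default log level. A minimal sketch of how to surface them again using only the standard library; it assumes the module obtains its logger via logging.getLogger(__name__) under the autogluon_assistant package, which is not shown in this diff:

import logging

# Keep third-party libraries at INFO, but let the assistant's own loggers
# (including task_inference) emit DEBUG records such as the LLM prompt/output.
logging.basicConfig(level=logging.INFO)
logging.getLogger("autogluon_assistant").setLevel(logging.DEBUG)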
55 changes: 42 additions & 13 deletions
@@ -7,6 +7,8 @@
 import pandas as pd
 import torch
 from sentence_transformers import SentenceTransformer
+import gensim.downloader as api
+from gensim.utils import tokenize

 from .base import BaseFeatureTransformer

@@ -25,34 +27,61 @@ def get_device_info():
     return DeviceInfo(cpu_count, gpu_devices)


-def _run_one_proc(model, data):
+def huggingface_run(model, data):
     if all(isinstance(x, str) for x in data) and any(len(x.split(" ")) > 10 for x in data):
         data = np.where(pd.isna(data), "", data)
         return model.encode(data).astype("float32")
     else:
         return np.zeros(len(data))


+def glove_run_one_proc(model, data):
+    embeddings = []
+    if all(isinstance(x, str) for x in data) and any(len(x.split(" ")) > 10 for x in data):
+        for text in data:
+            token_list = list(tokenize(text))
+            embed = model.get_mean_vector(token_list)
+            embeddings.append(embed)
+    else:
+        return np.zeros(len(data))
+    return np.stack(embeddings).astype('float32')


 class PretrainedEmbeddingTransformer(BaseFeatureTransformer):
     def __init__(self, model_name, **kwargs) -> None:
         self.model_name = model_name
-        try:
-            self.model = SentenceTransformer(self.model_name)
-        except:
-            logger.warning(f"No model {self.model_name} is found.")
+        if torch.cuda.is_available():
+            try:
+                self.model = SentenceTransformer(self.model_name)
+            except:
+                logger.warning(f"No model {self.model_name} is found.")
+        else:
+            logger.warning("CUDA is not found. For an optimized user experience, we switched to the GloVe embeddings.")
+            self.model_name = "glove-twitter"
+            self.dim = 100
+            self.max_num_procs = 16
+            try:
+                self.model = api.load(f"{self.model_name}-{self.dim}")
+            except:
+                logger.warning(f"No model {self.model_name}-{self.dim} is found.")
         self.cpu_count = int(os.environ.get("NUM_VISIBLE_CPUS", os.cpu_count()))

     def _fit_dataframes(self, train_X: pd.DataFrame, train_y: pd.Series, **kwargs) -> None:
         pass

     def _transform_dataframes(self, train_X: pd.DataFrame, test_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
-        if not torch.cuda.is_available():
-            pass
         assert (
             train_X.columns.values.tolist() == test_X.columns.values.tolist()
         ), "The columns of the training set do not match the columns of the test set"

         for series_name in train_X.columns.values.tolist():
-            transformed_train_column = _run_one_proc(self.model, np.transpose(train_X[series_name].to_numpy()).T)
-            transformed_test_column = _run_one_proc(self.model, np.transpose(test_X[series_name].to_numpy()).T)
+            if torch.cuda.is_available():
+                transformed_train_column = huggingface_run(self.model, np.transpose(train_X[series_name].to_numpy()).T)
+                transformed_test_column = huggingface_run(self.model, np.transpose(test_X[series_name].to_numpy()).T)
+            else:
+                transformed_train_column = glove_run_one_proc(self.model, np.transpose(train_X[series_name].to_numpy()).T)
+                transformed_test_column = glove_run_one_proc(self.model, np.transpose(test_X[series_name].to_numpy()).T)

             if transformed_train_column.any() and transformed_test_column.any():
                 transformed_train_column = pd.DataFrame(transformed_train_column)
@@ -62,5 +91,5 @@ def _transform_dataframes(self, train_X: pd.DataFrame, test_X: pd.DataFrame) ->
                 ]
             train_X = pd.concat([train_X.drop([series_name], axis=1), transformed_train_column], axis=1)
             test_X = pd.concat([test_X.drop([series_name], axis=1), transformed_test_column], axis=1)

         return train_X, test_X
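For reference, the CPU fallback added in this commit reduces to averaging pretrained GloVe word vectors over a tokenized string. Below is a minimal, self-contained sketch of that path; it assumes gensim's downloader can fetch (or has cached) the glove-twitter-100 vectors that the transformer builds from model_name="glove-twitter" and dim=100, and the example texts are made up:

import numpy as np
import gensim.downloader as api
from gensim.utils import tokenize

# Load the 100-dimensional GloVe Twitter vectors (downloads on first use).
glove = api.load("glove-twitter-100")

# Two made-up text cells, each longer than the 10-token threshold used above.
texts = [
    "this is a longer free-form text column that benefits from embeddings",
    "another example sentence with more than ten whitespace separated tokens here",
]

# Mirror glove_run_one_proc: tokenize each string, then average its word vectors.
embeddings = np.stack(
    [glove.get_mean_vector(list(tokenize(t))) for t in texts]
).astype("float32")

print(embeddings.shape)  # (2, 100)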
