From d7eadd42f6961cdcf597876bb62bc90d11027af6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pstr=C4=85g?= Date: Fri, 30 Aug 2024 14:32:33 +0200 Subject: [PATCH] update examples --- docs/quickstart/multiple-views.md | 1 + examples/multiple_views.py | 81 +++++++++++++++++++++++++++++++ examples/semantic_similarity.py | 39 +++++++++++++++ 3 files changed, 121 insertions(+) diff --git a/docs/quickstart/multiple-views.md b/docs/quickstart/multiple-views.md index e1783cc5..33cdc2d1 100644 --- a/docs/quickstart/multiple-views.md +++ b/docs/quickstart/multiple-views.md @@ -28,6 +28,7 @@ jobs_data = pd.DataFrame.from_records([ {"title": "Machine Learning Engineer", "company": "Company C", "location": "Berlin", "salary": 90000}, {"title": "Data Scientist", "company": "Company D", "location": "London", "salary": 110000}, {"title": "Data Scientist", "company": "Company E", "location": "Warsaw", "salary": 80000}, + {"title": "Data Scientist", "company": "Company F", "location": "Warsaw", "salary": 100000}, ]) ``` diff --git a/examples/multiple_views.py b/examples/multiple_views.py index 0644de73..a8b9423d 100644 --- a/examples/multiple_views.py +++ b/examples/multiple_views.py @@ -15,6 +15,7 @@ from dbally.embeddings.litellm import LiteLLMEmbeddingClient from dbally.llms.litellm import LiteLLM from dbally.similarity import FaissStore, SimilarityIndex, SimpleSqlAlchemyFetcher +from dbally.views.pandas_base import Aggregation, AggregationGroup engine = create_engine("sqlite:///examples/recruiting/data/candidates.db") @@ -76,6 +77,45 @@ def from_country(self, country: Annotated[str, country_similarity]) -> sqlalchem """ return Candidate.country == country + @decorators.view_aggregation() + def average_years_of_experience(self) -> sqlalchemy.Select: + """ + Calculates the average years of experience of candidates. + """ + return self.select.with_only_columns( + sqlalchemy.func.avg(Candidate.years_of_experience).label("average_years_of_experience") + ) + + @decorators.view_aggregation() + def positions_per_country(self) -> sqlalchemy.Select: + """ + Returns the number of candidates per position per country. + """ + return ( + self.select.with_only_columns( + sqlalchemy.func.count(Candidate.position).label("number_of_candidates"), + Candidate.position, + Candidate.country, + ) + .group_by(Candidate.position, Candidate.country) + .order_by(sqlalchemy.desc("number_of_candidates")) + ) + + @decorators.view_aggregation() + def top_universities(self, limit: int) -> sqlalchemy.Select: + """ + Returns the top universities by the number of candidates. + """ + return ( + self.select.with_only_columns( + sqlalchemy.func.count(Candidate.id).label("number_of_candidates"), + Candidate.university, + ) + .group_by(Candidate.university) + .order_by(sqlalchemy.desc("number_of_candidates")) + .limit(limit) + ) + jobs_data = pd.DataFrame.from_records( [ @@ -84,6 +124,7 @@ def from_country(self, country: Annotated[str, country_similarity]) -> sqlalchem {"title": "Machine Learning Engineer", "company": "Company C", "location": "Berlin", "salary": 90000}, {"title": "Data Scientist", "company": "Company D", "location": "London", "salary": 110000}, {"title": "Data Scientist", "company": "Company E", "location": "Warsaw", "salary": 80000}, + {"title": "Data Scientist", "company": "Company F", "location": "Warsaw", "salary": 100000}, ] ) @@ -114,6 +155,46 @@ def from_company(self, company: str) -> pd.Series: """ return self.df.company == company + @decorators.view_aggregation() + def average_salary(self) -> AggregationGroup: + """ + Calculates the average salary of job offers. + """ + return AggregationGroup( + aggregations=[ + Aggregation(column="salary", function="mean"), + ], + ) + + @decorators.view_aggregation() + def average_salary_per_location(self) -> AggregationGroup: + """ + Calculates the average salary of job offers per location and title. + """ + return AggregationGroup( + aggregations=[ + Aggregation(column="salary", function="mean"), + ], + groupbys=[ + "location", + "title", + ], + ) + + @decorators.view_aggregation() + def count_per_title(self) -> AggregationGroup: + """ + Counts the number of job offers per title. + """ + return AggregationGroup( + aggregations=[ + Aggregation(column="title", function="count"), + ], + groupbys=[ + "title", + ], + ) + def display_results(result: ExecutionResult): if result.view_name == "CandidateView": diff --git a/examples/semantic_similarity.py b/examples/semantic_similarity.py index b4a03b66..098f167a 100644 --- a/examples/semantic_similarity.py +++ b/examples/semantic_similarity.py @@ -76,6 +76,45 @@ def from_country(self, country: Annotated[str, country_similarity]) -> sqlalchem """ return Candidate.country == country + @decorators.view_aggregation() + def average_years_of_experience(self) -> sqlalchemy.Select: + """ + Calculates the average years of experience of candidates. + """ + return self.select.with_only_columns( + sqlalchemy.func.avg(Candidate.years_of_experience).label("average_years_of_experience") + ) + + @decorators.view_aggregation() + def positions_per_country(self) -> sqlalchemy.Select: + """ + Returns the number of candidates per position per country. + """ + return ( + self.select.with_only_columns( + sqlalchemy.func.count(Candidate.position).label("number_of_candidates"), + Candidate.position, + Candidate.country, + ) + .group_by(Candidate.position, Candidate.country) + .order_by(sqlalchemy.desc("number_of_candidates")) + ) + + @decorators.view_aggregation() + def top_universities(self, limit: int) -> sqlalchemy.Select: + """ + Returns the top universities by the number of candidates. + """ + return ( + self.select.with_only_columns( + sqlalchemy.func.count(Candidate.id).label("number_of_candidates"), + Candidate.university, + ) + .group_by(Candidate.university) + .order_by(sqlalchemy.desc("number_of_candidates")) + .limit(limit) + ) + async def main(): dbally.event_handlers = [CLIEventHandler()]