From 350266cbedae7f91d611a6362592470aa246bb7d Mon Sep 17 00:00:00 2001 From: Stefan Bachhofner Date: Mon, 27 May 2024 20:43:31 +0200 Subject: [PATCH 01/48] doc(gems): pin rexml because it causes a security issue --- docs/Gemfile | 1 + docs/Gemfile.lock | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/docs/Gemfile b/docs/Gemfile index 0fe407db..dc5b7658 100644 --- a/docs/Gemfile +++ b/docs/Gemfile @@ -6,6 +6,7 @@ source 'https://rubygems.org' # gem "jekyll", "~> 4.3.2" # installed by `gem jekyll` gem "just-the-docs", "0.7.0" # pinned to the current release +gem "rexml", "3.2.8" # pinned to that version to fix security alert # # Gems that are only loaded if they are configured correctly. diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index ebad891e..3dc199c8 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -75,17 +75,20 @@ GEM rb-fsevent (0.11.2) rb-inotify (0.10.1) ffi (~> 1.0) - rexml (3.2.6) + rexml (3.2.8) + strscan (>= 3.0.9) rouge (4.1.3) ruby2_keywords (0.0.5) safe_yaml (1.0.5) - sass-embedded (1.69.5-arm64-darwin) + sass-embedded (1.69.5) google-protobuf (~> 3.23) - sass-embedded (1.69.5-x86_64-linux-gnu) + rake (>= 13.0.0) + sass-embedded (1.69.5-arm64-darwin) google-protobuf (~> 3.23) sawyer (0.9.2) addressable (>= 2.3.5) faraday (>= 0.17.3, < 3) + strscan (3.1.0) terminal-table (3.0.2) unicode-display_width (>= 1.1.1, < 3) unicode-display_width (2.4.2) @@ -102,6 +105,7 @@ DEPENDENCIES jekyll-include-cache jekyll-seo-tag just-the-docs (= 0.7.0) + rexml (= 3.2.8) BUNDLED WITH 2.5.6 From e5ea6315663f3660b25183a0c85e35c750d18beb Mon Sep 17 00:00:00 2001 From: DARREN OBERST Date: Tue, 28 May 2024 13:25:41 -0400 Subject: [PATCH 02/48] adding question gen slim models to catalog with examples --- examples/Models/using-slim-q-gen.py | 137 +++++++ examples/Models/using-slim-qa-gen.py | 530 +++++++++++++++++++++++++++ llmware/model_configs.py | 131 ++++++- 3 files changed, 797 insertions(+), 1 deletion(-) create mode 100644 
examples/Models/using-slim-q-gen.py
 create mode 100644 examples/Models/using-slim-qa-gen.py

diff --git a/examples/Models/using-slim-q-gen.py b/examples/Models/using-slim-q-gen.py
new file mode 100644
index 00000000..d0889836
--- /dev/null
+++ b/examples/Models/using-slim-q-gen.py
@@ -0,0 +1,137 @@
+
+""" This example shows how to use the slim-q-gen models to automatically generate a question based on a
+context passage.
+
+    There are two 'q-gen' models - (a) tiny-llama base (1.1b), and (b) phi-3 base (3.8b)
+
+    Both models work the same way with tiny-llama a little faster, and phi-3 a little higher quality.
+
+    The models come packaged both as pytorch and gguf - for most inference use cases, we would recommend
+    the gguf versions which are considerably faster.
+
+    We would recommend experimenting with the temperature settings to optimize varied and
+    creative question generations.
+
+    Automated question generation has several use cases, including:
+
+    -- quiz test question generation for education, enablement, self-training or testing
+    -- search retrieval tagging to add top questions to the search index (both text and semantic)
+    -- agent-oriented scenarios in which one model 'asks' the question, and another model 'answers' it
+
+    models in catalog:
+
+    -- "slim-q-gen-phi-3"
+    -- "slim-q-gen-phi-3-tool"
+    -- "slim-q-gen-tiny"
+    -- "slim-q-gen-tiny-tool"
+
+    """
+
+from llmware.models import ModelCatalog
+
+
+def hello_world_test(source_passage, q_model="slim-q-gen-tiny-tool", number_of_tries=10, question_type="question",
+                     temperature=0.5):
+
+    """ Shows a basic example of generating questions from a text passage, running a number of inferences,
+    and then keeping only the unique questions generated.
+
+    -- source_passage = text passage
+    -- q_model = name of the q-gen model in the ModelCatalog
+    -- number_of_tries = integer number of times to call the model to generate a question
+    -- question_type = "question" | "boolean" | "multiple choice"
+    -- temperature = sampling temperature; 0.2 - 0.8 recommended
+
+    Returns the de-duplicated list of generated questions.
+    """
+
+    # recommend using temperature of 0.2 - 0.8 - for multiple choice, use lower end of the range
+    q_model = ModelCatalog().load_model(q_model, sample=True, temperature=temperature)
+
+    questions = []
+
+    for x in range(0, number_of_tries):
+
+        response = q_model.function_call(source_passage, params=[question_type], get_logits=False)
+
+        # expect response in the form of: "llm_response": {"question": ["generated question?"] }
+
+        if response:
+            if "llm_response" in response:
+                if "question" in response["llm_response"]:
+                    new_q = response["llm_response"]["question"]
+
+                    # keep only new questions
+                    if new_q not in questions:
+                        questions.append(new_q)
+
+                    print(f"inference {x} - response: {response['llm_response']}")
+
+    print(f"\nDe-duped list of questions created\n")
+    for i, question in enumerate(questions):
+
+        print(f"new generated questions: {i} - {question}")
+
+    return questions
+
+
+def ask_and_answer_game(source_passage, q_model="slim-q-gen-tiny-tool", number_of_tries=10, question_type="question",
+                        temperature=0.5):
+
+    """ Shows a simple two model game of using q-gen model to generate a question, and then a second model
+    to answer the question generated.
+
+    Both question generation and answering run against the same source_passage argument.
+    """
+
+    # this is the model that will generate the 'question'
+    q_model = ModelCatalog().load_model(q_model, sample=True, temperature=temperature)
+
+    # this will be the model used to 'answer' the question
+    answer_model = ModelCatalog().load_model("bling-phi-3-gguf")
+
+    questions = []
+
+    print(f"\nGenerating a set of questions automatically from the source passage.\n")
+
+    for x in range(0,number_of_tries):
+
+        response = q_model.function_call(source_passage, params=[question_type], get_logits=False)
+
+        if response:
+            if "llm_response" in response:
+                if "question" in response["llm_response"]:
+                    new_q = response["llm_response"]["question"]
+
+                    # only keep new questions
+                    if new_q and new_q not in questions:
+                        questions.append(new_q)
+
+                    print(f"inference - {x} - response: {response}")
+
+    print("\nAnswering the generated questions\n")
+    for i, question in enumerate(questions):
+
+        print(f"\nquestion: {i} - {question}")
+        if isinstance(question, list) and len(question) > 0:
+            # fix: answer against the passage passed in by the caller, not the
+            # module-level 'test_passage' global that the original referenced
+            response = answer_model.inference(question[0], add_context=source_passage)
+            print(f"response: ", response["llm_response"])
+
+    return True
+
+
+if __name__ == "__main__":
+
+    # test passage pulled from CNBC news story on Tuesday, May 28, 2024
+    test_passage = ("OpenAI said Tuesday it has established a new committee to make recommendations to the "
+                    "company’s board about safety and security, weeks after dissolving a team focused on AI safety. "
+                    "In a blog post, OpenAI said the new committee would be led by CEO Sam Altman as well as "
+                    "Bret Taylor, the company’s board chair, and board member Nicole Seligman. The announcement "
+                    "follows the high-profile exit this month of an OpenAI executive focused on safety, "
+                    "Jan Leike. Leike resigned from OpenAI leveling criticisms that the company had "
+                    "under-invested in AI safety work and that tensions with OpenAI’s leadership had "
+                    "reached a breaking point.")
+
+    # first example
+    hello_world_test(test_passage,q_model="slim-q-gen-tiny-tool",number_of_tries=10,
+                     question_type="question",
+                     temperature=0.5)
+
+    # second example
+    ask_and_answer_game(test_passage,q_model="slim-q-gen-phi-3-tool", number_of_tries=10,
+                        question_type="question",
+                        temperature=0.5)
diff --git a/examples/Models/using-slim-qa-gen.py b/examples/Models/using-slim-qa-gen.py
new file mode 100644
index 00000000..bba81b9c
--- /dev/null
+++ b/examples/Models/using-slim-qa-gen.py
@@ -0,0 +1,530 @@
+
+""" This example shows how to use the slim-qa-gen models to automatically generate a question and answer based on a
+context passage.
+
+    There are two 'qa-gen' models - (a) tiny-llama base (1.1b), and (b) phi-3 base (3.8b)
+
+    Both models work the same way with tiny-llama a little faster, and phi-3 a little higher quality.
+
+    The models come packaged both as pytorch and gguf - for most inference use cases, we would recommend
+    the gguf versions which are considerably faster.
+
+    This example uses an earnings_release test set that was also included in using_slim_extract_model.py, primarily
+    because it generates an interesting set of questions and answers. Feel free to substitute your own source
+    test sets.
+
+    In the example, we will show how to take the generated question-answer pairs and create a mini self-supervised
+    instruct dataset.
+ + models in catalog: + + -- "slim-qa-gen-phi-3" + -- "slim-qa-gen-phi-3-tool" + -- "slim-qa-gen-tiny" + -- "slim-qa-gen-tiny-tool" + + """ + +import json +import os +import random + +from llmware.models import ModelCatalog +from llmware.gguf_configs import GGUFConfigs +from llmware.configs import LLMWareConfig + + +def earning_releases_test_set(): + + earnings_releases = [ + + {"context": "Adobe shares tumbled as much as 11% in extended trading Thursday after the design software maker " + "issued strong fiscal first-quarter results but came up slightly short on quarterly revenue guidance. " + "Here’s how the company did, compared with estimates from analysts polled by LSEG, formerly known as Refinitiv: " + "Earnings per share: $4.48 adjusted vs. $4.38 expected Revenue: $5.18 billion vs. $5.14 billion expected " + "Adobe’s revenue grew 11% year over year in the quarter, which ended March 1, according to a statement. " + "Net income decreased to $620 million, or $1.36 per share, from $1.25 billion, or $2.71 per share, " + "in the same quarter a year ago. During the quarter, Adobe abandoned its $20 billion acquisition of " + "design software startup Figma after U.K. regulators found competitive concerns. The company paid " + "Figma a $1 billion termination fee."}, + + { + "context": "Dick’s Sporting Goods raised its dividend by 10% on Thursday as the company posted its largest sales " + "quarter in its history and projected another year of growth. The company’s shares jumped more than " + "15% in intraday trading. CEO Lauren Hobart said on an earnings call Thursday that Dick’s sales " + "growth came from bigger tickets — either higher prices or more expensive items — as its transactions " + "were flat. Many retailers benefited from a 53rd week in fiscal 2023, but Dick’s said it still broke " + "records during its fiscal fourth quarter even without those extra days. 
Here’s how the athletic " + "apparel retailer did compared with what Wall Street was anticipating, based on a survey of " + "analysts by LSEG, formerly known as Refinitiv: Earnings per share: $3.85 adjusted vs. $3.35 expected " + "Revenue: $3.88 billion vs. $3.80 billion expected The company’s reported net income for the three-month " + "period that ended Feb. 3 was $296 million, or $3.57 per share, compared with $236 million, or $2.60 a " + "share, a year earlier. Excluding one-time items related to impairment charges and inventory write-offs, " + "Dick’s reported earnings per share of $3.85. Sales rose to $3.88 billion, up about 8% from $3.60 billion " + "a year earlier. “With our industry-leading assortment and strong execution, we capped off the year " + "with an incredibly strong fourth quarter and holiday season,” Hobart said in a statement. “We are " + "guiding to another strong year in 2024. We plan to grow both our sales and earnings through " + "positive comps, higher merchandise margin and productivity gains,” she added. During the quarter, " + "same-store sales rose 2.8%, well ahead of the 0.8% lift that analysts had expected, according to " + "StreetAccount. “Growth in transactions” and market share gains drove the increase, said Executive " + "Chairman Ed Stack."}, + + {"context": "Comcast topped both revenue and profit estimates in the fourth quarter as it lost fewer broadband " + "subscribers than expected, and it raised its dividend 7%, the company said Thursday. " + "Here’s how Comcast performed, compared with estimates from analysts surveyed by LSEG, " + "formerly known as Refinitiv. Earnings per share: 84 cents adjusted vs. 79 cents expected " + "Revenue: $31.25 billion vs. $30.51 billion expected For the quarter ended Dec. 31, net " + "income rose 7.8% to $3.26 billion, or 81 cents a share, compared with $3.02 billion, or " + "70 cents a share, a year earlier. Revenue increased 2.3% compared with the prior-year period. 
" + "Adjusted earnings before interest, taxes, depreciation and amortization (EBITDA) was flat year " + "over year at about $8 billion. 'For the third consecutive year, we generated the highest revenue, " + "adjusted EBITDA and adjusted EPS in our company’s history', Comcast Chief Executive Officer Brian " + "Roberts said in a statement. 'We also reported the highest adjusted EBITDA on record at Theme Parks; " + "were the #1 studio in worldwide box office for the first time since 2015; and maintained Peacock’s " + "position as the fastest growing streamer in the U.S.'"}, + + {"context": "Dollar General forecast annual sales above Wall Street estimates on Thursday, banking on higher " + "demand from inflation-hit customers buying groceries and essentials from the discount retailer’s stores. " + "Shares of the company rose about 6% in early trading, after falling nearly 45% in 2023 on rising costs " + "and stiff competition from bigger retailers. But higher prices and borrowing costs have prompted " + "budget-conscious consumers to cook more meals at home, helping Dollar General record stronger " + "footfall at its outlets as shoppers hunt for lower-margin, needs-based goods, over pricier general " + "merchandise. “With customer traffic growth and market share gains during the quarter, we believe our " + "actions are resonating with customers,” CEO Todd Vasos said in a statement. Vasos’s strategy - to focus " + "on the basics, like more employee presence at stores, greater customer engagement and expanding " + "private-label brands - has helped stabilize Dollar General’s business. Over the last few quarters, " + "Dollar General and rival Dollar Tree have struggled with rising costs linked to their supply " + "chains, labor and raw materials, while facing tough competition from retailers like Walmart " + "and Chinese ecommerce platform Temu. 
Dollar Tree’s shares fell more than 15% on Wednesday, after it " + "forecast weak sales and profit for 2024 and laid out plans to shutter 970 of its Family Dollar " + "stores. “Dollar General has a much rosier outlook than Dollar Tree... Dollar Tree’s challenges " + "with Family Dollar were years in the making, while Dollar General has embarked on an aggressive " + "effort to add more frozen, refrigerated and fresh produce,” eMarketer senior analyst Zak Stambor said. " + "Dollar General forecast 2024 sales to grow between 6.0% and 6.7%, above analysts’ estimate of 4.4% " + "growth to $40.33 billion, according to LSEG data. It still sees annual per-share profit between " + "$6.80 and $7.55, compared with estimates of $7.55. Its fourth-quarter net sales of $9.86 billion " + "surpassed estimates of $9.78 billion. It also reported an estimate-beating profit of $1.83 per share."}, + + { + "context": "Shares of Zara owner Inditex hit record highs on Wednesday according to LSEG data, climbing over 6% during " + "intraday trading after the company announced its 2023 full-year results. As of 11:50 London time, shared " + "were just over 6% higher at 43.58 euros, or $47.69. Sales increased 10.4% to 35.9 billion euros for the " + "year, the company said, signaling this was a record high. Sales grew across all geographic regions and " + "across Inditex’s brands and were “very satisfactory,” both online and in store, the company said. A total of " + "5,692 stores were operational at the end of the year, Inditex said, adding it plans to expand further in " + "2024, including with Zara shops in Los Angeles and Las Vegas. The company also plans to open new distribution " + "centers in 2024 and 2025, as part of a major logistics expansion plan that will cost the company " + "investments of 900 million euros in both years. Net income also reached a fresh high after soaring " + "30.3% from 2022 to reach 5.4 billion euros last year. 
The company’s gross profit came in at 20.8 billion " + "euros, up 11.9% on the year. “Inditex’s performance in 2023 has been excellent. Our teams have been able to " + "take advantage of the opportunities to keep growing profitably. We are investing to drive future growth and " + "continue to offer an attractive remuneration to shareholders,” Inditex CEO Oscar García Maceiras said in a " + "statement. The Spanish clothing company owns a range of vastly popular brands including household name " + "Zara, as well as Pull & Bear, Bershka, Stradivarius, premium retailer Massimo Dutti and sports and the " + "athleisure-focused Oysho. Zara, including the Zara Home range, was the biggest contributor to sales in " + "2023, followed by Pull & Bear and Massimo Dutti, Inditex said Wednesday. The company also indicated " + "that 2024 was off to a strong start, with sales in constant currency up 11% over the Feb. 1 to March 11 " + "stretch, compared with the same period a year earlier."}, + + { + "context": "Oracle reported quarterly earnings on Monday that exceeded Wall Street’s expectations. Shares rose " + "13% in extended trading. Here’s how the company did in the fiscal third quarter ending Feb. 29, compared " + "to estimates by LSEG, formerly known as Refinitiv: Earnings per share: $1.41 adjusted vs. $1.38 expected " + "Revenue: $13.28 billion vs. $13.3 billion expected For the fiscal fourth quarter, Oracle said it expects " + "earnings of $1.62 to $1.66 per share. Analysts were expecting $1.64 in adjusted earnings per share, according " + "to LSEG. Revenue growth will be between 4% and 6% over sales of $13.8 billion a year ago. The midpoint of that " + "range would equal revenue of about $14.5 billion, while analysts were expecting a little more than $14.7 billion. " + "Oracle CEO Safra Catz said the company was committed to hitting previously stated goals of $65 billion in " + "sales by fiscal 2026. 
“Some of these goals might prove to be too conservative given our momentum,” Catz said. " + "Revenue rose 7% in the quarter from $12.4 billion a year earlier. Net income climbed 27% to $2.4 billion, " + "or 85 cents per share, from $1.9 billion, or 68 cents per share, a year ago. Oracle’s cloud services and " + "license support segment, its largest business, saw sales rise 12% to $9.96 billion, slightly beating " + "StreetAccount consensus expectations of $9.94 billion. The company attributed the rise to strong demand " + "for its artificial intelligence servers. Catz said the company added several “large new cloud " + "infrastructure” contracts during the quarter. The company’s cloud revenue, which is reported as part " + "of the cloud services unit, rose 25% year over year to $5.1 billion, Oracle said. “We signed several large " + "deals this quarter and we have many more in the pipeline,” Catz told investors on the earnings call. " + "Oracle Chairman Larry Ellison cited increased business from Microsoft on the earnings call. " + "“We’re building 20 data centers from Microsoft and Azure. They just ordered three more data centers " + "this week,” Ellison said. The company’s other units didn’t fare as well. Cloud license and on-premise sales " + "declined 3% to $1.26 billion, slightly beating StreetAccount’s forecast. Hardware revenue fell 7% to " + "$754 million, while sales in the company’s services division slid 5% to $1.31 billion, both falling short " + "of StreetAccount expectations. Prior to Monday’s report, Oracle shares were up 8.7% for the year, " + "slightly outperforming the S&P 500."}, + + { + "context": "Porsche on Tuesday warned that profitability will decline this year as it launches new models amid " + "tough economic conditions, but hiked its dividend on the back of a rise in 2023 operating profit. 
The German " + "luxury automaker said it expects an operating return on sales of between 15% and 17% in 2024, down from the " + "18% margin notched in 2023 and 2022. In the long term, the group targets an operating return on sales of more " + "than 20%. Explaining the more cautious profitability outlook, the company cited “the comprehensive renewal " + "of its product range in 2024, the global framework conditions, higher depreciations on capitalized " + "development costs and the continued investments in the brand and the Porsche ecosystem.” The company’s shares " + "were around 4.8% higher by early afternoon, having reversed opening losses of more than 2%. Porsche is " + "launching four new car ranges in 2024 in the form of the Panamera, Macan, Taycan and 911 model lines. " + "Porsche CFO: Expect significant growth in high-net-worth individuals in China WATCH NOW VIDEO 03:17 " + "Porsche CFO: Expect significant growth in high-net-worth individuals in China “2024 is going to be a year of " + "product launches for Porsche – more so than any year in our history,” Chairman Oliver Blume said in a " + "statement. “We will be introducing a variety of exhilarating sports cars to the road, they will delight " + "our customers around the world. This will put the wind at our back for years to come.”"}, + + { + "context": "Lego on Tuesday reported its full-year 2023 results, saying it’s revenue grew by 2% throughout the year, " + "in line with expectations. The company made “very, very strong progress” and “grew comfortably” in the " + "U.S., its CEO Niels Christiansen told CNBC. The toy industry has been struggling to maintain " + "pandemic-era growth as inflation is putting pressure on demand and sales. 
In-store participation is greater " + "than prior to the pandemic, Lego CEO says In-store participation is greater than prior " + "to the pandemic, Lego CEO says The chief executive of Denmark’s Lego on Tuesday reflected on a tough year " + "for the world’s largest toymaker, and outlined the firm’s long-term plans to stay relevant and “cool with kids. ”" + "Lego said its 2023 revenue was 2% higher compared to the previous year, growing to 65.9 billion Danish krone " + "(around $9.65 billion). This was in line with expectations, Lego said in a statement. “It was a difficult year,” " + "Lego CEO Niels Christiansen told CNBC. However, he said the company had “managed to take quite a bit of " + "market share.” The Danish toymaker said operating profit declined slightly from 17.9 billion Danish krone " + "to 17.1 billion, noting that it had boosted spending on strategic initiatives designed to drive growth. " + "Net profit came in at 13.1 billion Danish krone in 2023, compared to 13.8 billion the previous year. " + "Consumer sales were up 4% despite slumping in China, Lego said, attributing the growth to increasing demand " + "in the U.S. and central and eastern Europe. It comes as the wider toy industry has been struggling to " + "maintain growth after booming during the coronavirus pandemic, when parents looked for new ways to " + "entertain their children and adults re-discovered childhood pastimes. Toy company Hasbro earlier this month " + "said its 2023 revenue fell by 15% compared to 2022 and that it expected to see a further decline this year."}, + + { + "context": "Adidas on Wednesday warned of a sales decline in its overstocked North American market in 2024, as the " + "German sportswear brand continues to sell off its remaining Yeezy inventory. 
Currency-neutral sales in " + "North America are expected to decline to a mid-single-digit rate in 2024, but are projected to notch " + "mid-single-digit growth worldwide despite persistent “macroeconomic challenges and geopolitical tensions,” " + "the company said. Adidas confirmed its 2023 operating profit came in at 268 million euros ($292.9 million) " + "on the back of flat currency-neutral sales, significantly above prior expectations as the company continues " + "to take a hit from the cessation of its line of Yeezy — footwear the retailer produced in a collaboration with " + "American rapper Ye, formerly known as Kanye West. For the fourth quarter, the company posted an operating " + "loss of 377 million euros. The board proposed a flat dividend of 0.70 euros per share. “Although by far not " + "good enough, 2023 ended better than what I had expected at the beginning of the year,” CEO Bjørn Gulden " + "said in a statement. “Despite losing a lot of Yeezy revenue and a very conservative sell-in strategy, " + "we managed to have flat revenues. We expected to have a substantial negative operating result, but " + "achieved an operating profit of €268 million.” Adidas was confirming preliminary results released in late " + "January, when it announced that it would not write off the majority of its Yeezy inventory and would instead " + "sell off the remaining shoes at cost. The sportswear giant was forced to axe the Yeezy line after terminating " + "its partnership with Ye over a string of anti-Semitic remarks that the rapper made in 2022. Adidas said the " + "discontinuation of Yeezy represented a drag of around 500 million euros in the year-on-year comparison " + "through 2023, though the sale of parts of the remaining inventory in the second and third quarter positively " + "impacted net sales by around 750 million euros. “With a very disciplined go-to-market and buying process, " + "we reduced our inventories by almost €1.5 billion. 
With the exception of the U.S., we now have healthy " + "inventories everywhere,” Gulden said. He added that the company is expecting some growth in the " + "first quarter of 2024 and a further pick-up in the second half of the year. “We still have a lot of work " + "to do, but I feel very confident we are on the right track. We will bring adidas back again. Give us some " + "time and we will again say – we got this!” he said. Adidas projected an operating profit of around " + "500 million euros in 2024, with unfavorable currency effects expected to “weigh significantly on the " + "company’s profitability” because of adverse impacts on both reported revenues and gross margin development." + "Adidas shares were flat by mid-morning on Wednesday. Mamta Valechha, equity research analyst at " + "Quilter Cheviot, said that, given that the headline numbers were already pre-released in January, the most " + "interesting aspect of Wednesday’s report was the “clear acceleration of the Adidas brand.”"}, + + { + "context": "Costco on Thursday missed Wall Street’s revenue expectations for its holiday quarter, despite reporting " + "year-over-year sales growth and strong e-commerce gains. Shares of the retailer fell about 4% in aftermarket " + "trading. The company’s stock had hit a 52-week high earlier in the day. Here’s what Costco reported for its " + "fiscal second quarter of 2024 compared with what Wall Street was expecting, based on a survey of analysts by " + "LSEG, formerly known as Refinitiv: Earnings per share: $3.92 vs. $3.62 expected Revenue: $58.44 billion vs. " + "$59.16 billion expected In the three-month period that ended Feb. 18, Costco’s net income rose to " + "$1.74 billion, or $3.92 per share, compared with $1.47 billion, or $3.30 per share, a year earlier. " + "Costco’s revenue for the quarter increased from $55.27 billion in the year-ago period. Comparable sales for " + "the company increased 5.6% year over year and 4.3% in the U.S. 
Excluding changes in gas prices and foreign " + "currency, the metric increased 5.8% overall and 4.8% in the U.S. Sales of food and sundries, a category " + "that includes snack foods and beverages, were up by mid single digits in the quarter, CFO Richard Galanti " + "said on the company’s earnings call. Fresh foods were up high single digits and nonfoods were up mid single " + "digits. Ancillary businesses, which includes more service-related purchases like travel, were up by low " + "single digits, he said. Costco’s food court, pharmacy and optical centers were top performers in the quarter " + "and gas was down low single digits as the price per gallon fell. More shoppers came to Costco, and they " + "spent more on their shopping trips during the quarter. Traffic increased 5.3% across the globe and 4.3% in " + "the U.S., Galanti said on the earnings call. The average ticket increased in the U.S. and worldwide, he " + "said. Inflation was roughly flat year over year in the quarter, which allowed the retailer to reduce " + "prices for some items, Galanti said. For example, he said, it’s been able to cut the price of reading " + "glasses from $18.99 to $16.99 and slash the price of a 48 count of Kirkland Signature batteries from " + "$17.99 to $15.99. In the prior quarter, he said inflation was as much as 1% year over year. Galanti said many " + "new items in categories like sporting goods and lawn and garden will also have lower prices compared with " + "a year ago because of falling freight and commodity costs. Costco has 875 warehouses, including 603 in " + "the U.S. and Puerto Rico. It also has clubs in about a dozen other countries, including Canada, Mexico, " + "Japan and China. In the second quarter, Costco opened four new clubs, including three in the U.S. " + "and one in Shenzhen, China. That marked its sixth club to open in China, Galanti said. Two of the three " + "new U.S. 
locations were Costco Business Centers, which are specifically geared toward small business " + "owners like restaurant operators. As of Thursday’s close, Costco shares have risen nearly 19% since the " + "start of the year. The stock touched a 52-week high of $787.08 earlier in the day and closed at $785.59, " + "bringing the company’s market value to nearly $350 billion."}, + + { + "context": "Shares of Teleperformance plunged 23% on Thursday, after the French call center and office services group " + "missed its full-year revenue target and flagged a “volatile economic environment.” Investors have been " + "spooked by the potential impact of artificial intelligence on its business model, as companies become more " + "able to tap into the technology directly for their own benefit. Teleperformance shares dropped 16% last " + "week, according to LSEG data, after Swedish financial services company Klarna said its Open AI-powered " + "customer service assistant was handling two-thirds of customer service calls. But Teleperformance CEO " + "Daniel Julien on Thursday said that AI would be a positive for its business model — and that it will never " + "fully replace the value of human interaction. “AI is part of the solutions we provide to the clients,” " + "Julien told CNBC’s “Squawk Box Europe.” “AI helps to increase the accuracy of our employees ... which is " + "great, but at the end of the day we are here to reduce the friction between the citizens, or the customer, " + "and the companies they have bought a product and service from.” He stressed, “It’s not just a transactional " + "relationship, it has a lot to do with reassuring, with trust, with empathy. So we perceive AI as enhancing " + "the job that our human employees do, but absolutely not replacing them.” hide content Teleperformance SE " + "RT Quote | Exchange | EUR 87.16 quote price arrow up+0.68 (+0.79%) Last | 03/15/24 CET WATCHLIST + QUOTE DETAILS " + "Teleperformance share price. 
Teleperformance reported 2.3% higher revenue at 8.345 billion euros " + "($9.091 billion) in 2023, as net profit fell year-on-year from 643 million euros to 602 million euros. " + "Diluted earnings per share hit 10.18 euros, down from 10.77 euros. In its results, the company said it is " + "working with clients on 250 AI projects, including in generative AI, and it has expanded its portfolio " + "with new partnerships in the space. “Even the most high-tech or the most AI-involved companies are clients " + "of Teleperformance. We chose that there is a complementarity and not separation,” Julien told CNBC, " + "flagging the company’s agreement with tech giant and major AI player Microsoft. “They are there to provide " + "a solution that is going to augment the productivity, augment the quality of the information that can be " + "given to the customer, but, at the end of the day, the customer is a human being. The day the customer is " + "going to be a robot, maybe AI will replace the humans.”"}, + + {"context": "CrowdStrike shares surged as much as 21% in after-hours trading Tuesday after the cybersecurity " + "company reported a beat on the top and bottom lines, plus issued stronger-than-expected guidance for " + "the upcoming quarter and full year. Here’s how the company did compared to consensus estimates " + "based on a survey of analysts by LSEG, formerly known as Refinitiv: Earnings per share: 95 cents " + "adjusted vs. 82 cents expected Revenue: $845 million vs. $839 million expected For the period that " + "ended Jan. 31, CrowdStrike saw net income of $54 million, or 22 cents per share, from a " + "$48 million loss, or a 20 cent loss per share, in the year-ago period. CrowdStrike has now " + "reported GAAP net income for the past four quarters, Chief Financial Officer Burt Podbere " + "said in the earnings release. Full-year revenue rose 36% year over year, from $2.24 billion " + "to $3 billion. 
The company also announced it would acquire Flow Security for an undisclosed " + "price in a cash-and-stock deal, slated to close in the company’s fiscal first quarter. " + "The company has been stepping up its merger and acquisition activity in recent months. “CrowdStrike " + "is cybersecurity’s consolidator of choice, innovator of choice, and platform of choice to " + "stop breaches,” co-founder and CEO George Kurtz said in a release. The company also guided to " + "fiscal first-quarter revenue between $902 million and $906 million, better than a consensus " + "estimate of $899 million. CrowdStrike also expects earnings per share for the period between " + "89 cents and 90 cents, better than the consensus estimate of 82 cents. Podbere also reiterated " + "the company’s focus on achieving $10 billion in annual recurring revenue by 2030. The company " + "reached $3.4 billion in annual recurring revenue in January."}, + + {"context": "Shares of Amer Sports, the maker of Wilson tennis rackets and Lousiville Slugger baseball " + "bats, fell on Tuesday after the company reported strong sales in China but a slowdown in " + "wholesale orders. Here’s how the newly public athletic company did in its fourth quarter. " + "CNBC didn’t compare the results to Wall Street estimates because it’s the first earnings " + "report since Amer Sports went public. Loss per share: 25 cents Revenue: $1.32 billion In the " + "three months ended Dec. 31, the company reported a net loss of $94.9 million, or 25 cents " + "per share, compared with $148.3 million, or 39 cents per share, a year earlier. Sales rose to " + "$1.32 billion, up about 10% from $1.2 billion a year earlier. Shares closed about 5% lower. " + "Amer, which also owns Arc’teryx, Salomon, and a number of other athletic equipment and " + "apparel brands, operates in three distinct business segments. 
They are technical apparel, " + "which includes its pricey Arc’teryx winter jackets; outdoor performance, such as Salomon’s " + "winter sports equipment; and ball and racquet sports, which includes equipment and apparel " + "from Wilson and Louisville, among others. During the quarter, sales for Amer’s technical " + "apparel rose 26% year over year to $550 million, driven by a 42% jump in direct sales. " + "Sales in the segment primarily come from shoppers who are buying directly from Amer’s " + "brands rather than from wholesale partners. Sales for outdoor performance increased 2% to " + "$523 million, driven by strength in the segment’s winter sports equipment franchise, " + "which was offset by a slowdown in wholesale orders for Salomon footwear. Ball and racquet sales " + "declined 3% to $242 million as the segment lapped tougher comps. In the year-ago period, " + "retailers were still dealing with supply chain issues and had over-ordered equipment like " + "tennis rackets and baseball bats. As they looked to keep their inventory levels in check, " + "some wholesalers pulled back on orders during the quarter, but Amer expects the segment " + "will level out in the quarters ahead and end fiscal 2024 with sales up in the low- to " + "mid-single digit range. The company started trading on the New York Stock Exchange last " + "month under the ticker “AS.” The shares rose just 3% in Amer’s debut on the public " + "markets after it priced its IPO at a discount. Sellers showed muted interest in the " + "stock during its first day of trading over concerns about its connections and exposure to " + "China and its debt-laden balance sheet. Founded in Helsinki in 1950, Amer was a Finnish public " + "company until it was taken private in 2019 by a consortium of investors led by China’s Anta " + "Sports, FountainVest Partners, Anamered Investments and Tencent. Since the acquisition, " + "sales grew about 45% from $2.45 billion in 2020 to $3.55 billion in 2022. 
Revenue jumped " + "again in 2023 to $4.37 billion, the company said Tuesday."}, + + { + "context": "Shares of Dell Technologies popped more than 15% during extended trading Thursday after the company " + "released fourth-quarter results that beat analysts’ estimates and showed strong demand for its " + "artificial intelligence servers. Here’s how the company did: Earnings per share: $2.20 adjusted vs. " + "$1.73 expected by LSEG, formerly known as Refinitiv Revenue: $22.32 billion vs. $22.16 billion " + "expected by LSEG Dell’s revenue for the fiscal 2024 fourth quarter fell 11% from $25.04 billion " + "in the year-ago quarter. The company reported net income $1.16 billion, up 89% from the $614 million " + "it posted in the same period last year. Chief Financial Officer Yvonne McGill said in a release " + "that the company is increasing its annual dividend by 20% to $1.78 per share, which she called " + "a “testament to our confidence in the business.” Dell’s Infrastructure Solutions Group (ISG) " + "reported $9.3 billion in revenue for the quarter, down 6% year over year but up 10% from the " + "third quarter. Servers and networking revenue made up the bulk of that, with $4.9 billion in " + "revenue driven by “AI-optimized servers.” Storage revenue came in at $4.5 billion. The company’s " + "Client Solutions Group (CSG) reported $11.7 billion for the quarter, down 12% year over year. " + "That includes $9.6 billion in commercial client revenue, which fell 11% since the fourth quarter " + "of last year, and $2.2 billion in consumer revenue, down 19% year over year. “Our strong AI-optimized " + "server momentum continues, with orders increasing nearly 40% sequentially and backlog nearly " + "doubling, exiting our fiscal year at $2.9 billion,” Chief Operating Officer Jeff Clarke " + "said in the release. For its first quarter, Dell said during its quarterly call with " + "investors that it expects to report revenue between $21 billion and $22 billion. 
The company " + "said it is encouraged by momentum around AI, and that it expects to return to growth for " + "fiscal 2025. However, the company noted that the macroeconomic environment is causing some " + "customers to be cautious about infrastructure costs."}, + + {"context": "Birkenstock on Thursday beat holiday quarter revenue expectations, reporting a 22% year-on-year " + "jump, as the German sandal company benefited from higher pricing and rising U.S. demand. As a newly " + "public company, Birkenstock is still getting into a public reporting rhythm and only just " + "released its fiscal 2023 results and 2024 guidance a little over a month ago. On Thursday, " + "it said it stands by guidance issued then and still expects sales to be between 1.74 billion " + "euros and 1.76 billion euros ($1.89 billion and $1.91 billion), representing growth of 17% to 18%. " + "The shoemaker, which started trading on the New York Stock Exchange under the ticker “BIRK” in " + "October, saw a muted debut when it first hit the public markets, with shares sliding more than " + "12% on its first day as a public company. The stock has since rebounded and is up more than 5% " + "this year, as of the Wednesday close. Birkenstock’s shares closed more than 2% lower Thursday. " + "Here’s how the shoemaker did in its fiscal first quarter compared with what Wall Street was " + "anticipating, based on a survey of analysts by LSEG, formerly known as Refinitiv: Earnings per " + "share: 9 euro cents adjusted vs. 9 euro cents expected. Revenue: 302.9 million euros vs. 288.7 " + "million euros expected. The company reported a net loss of 7.15 million euros for the " + "three-month period that ended Dec. 31, or a loss of 4 euro cents per share. A year earlier, " + "it reported a loss of 9.19 million euros, or a loss of 5 euro cents per share. " + "Excluding one-time items, Birkenstock reported a profit of 17 million euros, or " + "9 euro cents per share. 
Sales rose to 302.9 million euros, up 22% from 248.5 million euros " + "a year earlier. Adjusted earnings before interest, taxation, depreciation and amortization " + "(EBITDA) rose 12% year on year to 81 million euros, with an adjusted EBITDA margin of 26.9%, " + "down from 29.1% a year earlier. The retailer has been making strides to grow its direct-to-consumer " + "business, which comes with better profits and more customer insights than relying on wholesale partners. " + "CEO Oliver Reichert has said the company deliberately engineers its distribution strategy so " + "demand is higher than supply but it’s working to double its production capabilities over " + "the next three years to narrow that gap. The chief executive said those investments, " + "along with other efforts the company is undertaking to drive growth, is having a “planned” " + "but “temporary” impact to profitability. The company’s gross profit margin inched down to " + "61% from 61.7% during the same period last year, with Birkenstock citing “unfavorable " + "currency translation and the planned, temporary under-absorption from our ongoing " + "capacity expansion.” The company said it continues to carefully track input costs and " + "is mitigating inflationary pressures with “executed, selective price increases.” In Europe, the " + "company said it had “two consecutive price adjustments” with “no signs of rejection.”"}, + + {"context": "Best Buy surpassed Wall Street’s revenue and earnings expectations for the holiday quarter on " + "Thursday, even as the company navigated through a period of tepid consumer electronics demand. " + "But the retailer warned of another year of softer sales and said it would lay off workers and " + "cut other costs across the business. CEO Corie Barry offered few specifics, but said the " + "company has to make sure its workforce and stores match customers’ changing shopping habits. 
" + "Cuts will free up capital to invest back into the business and in newer areas, such as artificial " + "intelligence, she added. “This is giving us some of that space to be able to reinvest into " + "our future and make sure we feel like we are really well positioned for the industry to " + "start to rebound,” she said on a call with reporters. For this fiscal year, Best Buy anticipates " + "revenue will range from $41.3 billion to $42.6 billion. That would mark a drop from the most " + "recently ended fiscal year, when full-year revenue totaled $43.45 billion. It said comparable " + "sales will range from flat to a 3% decline. The retailer plans to close 10 to 15 stores " + "this year after shuttering 24 in the past fiscal year. One challenge that will affect sales " + "in the year ahead: it is a week shorter. Best Buy said the extra week in the past fiscal " + "year lifted revenue by about $735 million and boosted diluted earnings per share by about " + "30 cents. Shares of Best Buy closed more than 1% higher Thursday after briefly touching " + "a 52-week high of $86.11 earlier in the session. Here’s what the consumer electronics " + "retailer reported for its fiscal fourth quarter of 2024 compared with what Wall Street was " + "expecting, based on a survey of analysts by LSEG, formerly known as Refinitiv: " + "Earnings per share: $2.72, adjusted vs. $2.52 expected Revenue: $14.65 billion vs. $14.56 " + "billion expected A dip in demand, but a better-than-feared holiday Best Buy has dealt " + "with slower demand in part due to the strength of its sales during the pandemic. Like " + "home improvement companies, Best Buy saw outsized spending as shoppers were stuck at " + "home. Plus, many items that the retailer sells like laptops, refrigerators and home " + "theater systems tend to be pricier and less frequent purchases. 
The retailer has cited other " + "challenges, too: Shoppers have been choosier about making big purchases while dealing " + "with inflation-driven higher prices of food and more. Plus, they’ve returned to " + "splitting their dollars between services and goods after pandemic years of little " + "activity. Even so, Best Buy put up a holiday quarter that was better than feared. " + "In the three-month period that ended Feb. 3, the company’s net income fell by 7% to " + "$460 million, or $2.12 per share, from $495 million, or $2.23 per share in the year-ago " + "period. Revenue dropped from $14.74 billion a year earlier. Comparable sales, a metric that " + "includes sales online and at stores open at least 14 months, declined 4.8% during the " + "quarter as shoppers bought fewer appliances, mobile phones, tablets and home theater " + "setups than the year-ago period. Gaming, on the other hand, was a strong sales " + "category in the holiday quarter."} + + ] + + return earnings_releases + + +def hello_world_test(test_set, qa_model="slim-qa-gen-tiny-tool", question_type="question", temperature=0.5): + + """ Shows a basic example of generating questions and answers from a test set, and then builds a simple mini + illustrative 'model-ready' instruct database built without any manual labeling or tagging. 
+ + -- test_set = list of dictionary entries with a single key "context" associated with the text passage + in each test entry + + -- qa_model = name of the qa gen model selected + + -- question_type = "question" | "boolean" | "multiple choice" + + -- temperature = experiment with different levels to optimize balance and variety + + """ + + # recommend using temperature of 0.2 - 0.8 - for multiple choice, use lower end of the range + + # note: if generation is very long (e.g., a summary question), then it is possible that the + # output will be malformed given the cut-off at 500 tokens - in this case, an automated remediation + # will try to resolve, but in many cases will provide an empty [] response - we have also observed + # some repetition in very long generations too + + qa_model = ModelCatalog().load_model(qa_model, sample=True, temperature=temperature, max_output=500, + get_logits=False) + + qa_pair_set = [] + ds = [] + + print(f"\nRun Q-A Gen Inferences on Test Set") + + for text_passage in test_set: + + response = qa_model.function_call(text_passage["context"], params=[question_type], get_logits=False) + + # expect response in the form of: + # -- "llm_response": {'question': ['generated question?'], 'answer': ['answer to question']} + + if response: + if "llm_response" in response: + if "question" in response["llm_response"] and "answer" in response["llm_response"]: + + # get the question and answer from the llm response + new_q = response["llm_response"]["question"] + new_a = response["llm_response"]["answer"] + + # keep only where there is both a non-empty question and answer + if new_q and new_a: + qa_pair_set.append({"question": new_q[0], "answer": new_a[0]}) + ds.append(({"question": new_q[0], "answer": new_a[0], "context": text_passage["context"]})) + + print(f"inference - response: ", response) + + print("\nShow list of question-answer pairs created") + + for i, qa_pair in enumerate(qa_pair_set): + + print(f"new generated question-answer pairs: {i} 
- {qa_pair}") + + print("\nBuild model-ready 'mini' instruct dataset") + + # use phi-3 instruct wrapper template as separators between elements + # --easy to substitute these separators for other popular templates + sep1= "<|user|>\n" + sep2 = "<|end|>\n" + sep3 = "<|assistant|>\n" + + model_ready_ds = [] + for i, sample in enumerate(ds): + new_sample = sep1 + sample["context"] + "\n" + sample["question"] + sep2 + sep3 + sample["answer"] + model_ready_ds.append({"text": new_sample, "source": "earnings_test_set_example"}) + + random.shuffle(model_ready_ds) + fp = os.path.join(LLMWareConfig().get_llmware_path(), "instruct_ds_example.jsonl") + + train_file = open(fp, "w") + for rows in model_ready_ds: + jsonl_row = json.dumps(rows) + train_file.write(jsonl_row) + train_file.write("\n") + + train_file.close() + + print(f"\ncreated dataset @: {fp}") + + return qa_pair_set + + +if __name__ == "__main__": + + # get the earnings release test set (list of dicts with "context" key) + test_set = earning_releases_test_set() + + # set a max generation on the GGUF engine at 1000 tokens - each model can be loaded up to this amount + GGUFConfigs().set_config("max_output_tokens", 1000) + + # run the main example + hello_world_test(test_set,qa_model="slim-qa-gen-tiny-tool",question_type="question", temperature=0.5) + diff --git a/llmware/model_configs.py b/llmware/model_configs.py index b57810a5..23342d1e 100644 --- a/llmware/model_configs.py +++ b/llmware/model_configs.py @@ -1112,7 +1112,136 @@ "function_call": True, "primary_keys": ["key points (3)"], "fc_output_values": [], "tokenizer": "llmware/slim-extract", "tokenizer_local": "tokenizer_stablelm.json", - "marker_tokens": [], "marker_token_lookup": {}, "function": ["summarize"], "snapshot": True} + "marker_tokens": [], "marker_token_lookup": {}, "function": ["summarize"], "snapshot": True}, + + # adding new slim q-gen models + {"model_name": "slim-q-gen-phi-3-tool", "display_name": "slim-q-gen-tool", + "model_family": 
"GGUFGenerativeModel", "model_category": "generative_local", "model_location": "llmware_repo", + "context_window": 4096, "instruction_following": False, "prompt_wrapper": "human_bot", + "temperature": 0.3, "sample_default": True, "trailing_space": "", + "gguf_file": "q_gen.gguf", + "gguf_repo": "llmware/slim-q-gen-phi-3-tool", + "link": "https://huggingface.co/llmware/slim-q-gen-phi-3-tool", + "custom_model_files": [], "custom_model_repo": "", + "output_type": "dict", + "function_call": True, + "primary_keys": ["question"], + "fc_output_values": [], + "tokenizer": "microsoft/Phi-3-mini-4k-instruct", + "tokenizer_local": "tokenizer_phi3.json", + "marker_tokens": [], "marker_token_lookup": {}, + "function": ["generate"], + "snapshot": True}, + + {"model_name": "slim-q-gen-tiny-tool", "display_name": "llmware/slim-q-gen-tiny-tool", + "model_family": "GGUFGenerativeModel", "model_category": "generative_local", "model_location": "llmware_repo", + "context_window": 4096, "instruction_following": False, "prompt_wrapper": "human_bot", + "temperature": 0.5, "sample_default": True, "trailing_space": "", + "gguf_file": "q_gen.gguf", + "gguf_repo": "llmware/slim-q-gen-tiny-tool", + "link": "https://huggingface.co/slim-q-gen-tiny-tool", + "custom_model_files": [], "custom_model_repo": "", + "output_type": "dict", + "function_call": True, + "primary_keys": ["question"], + "fc_output_values": [], + "tokenizer": "llmware/slim-sentiment", + "tokenizer_local": "tokenizer_tl.json", + "marker_tokens": [], "marker_token_lookup": {}, + "function": ["generate"], + "snapshot": True}, + + {"model_name": "llmware/slim-q-gen-tiny", "display_name": "slim-q-gen-tiny", + "model_family": "HFGenerativeModel", "model_category": "generative_local", "model_location": "hf_repo", + "context_window": 2048, "instruction_following": False, "prompt_wrapper": "human_bot", + "temperature": 0.5, "sample_default": True, "trailing_space": "", "gguf_file": "", "gguf_repo": "", + "link": 
"https://huggingface.co/llmware/slim-q-gen-tiny", + "hf_repo": "llmware/slim-q-gen-tiny", + "custom_model_files": [""], "custom_model_repo": "", + "output_type": "dict", "function_call": True, + "primary_keys": ["question"], + "fc_output_values": ["question"], + "marker_tokens": [], + "marker_token_lookup": {}, + "function": ["generate"]}, + + {"model_name": "llmware/slim-q-gen-phi-3", "display_name": "slim-q-gen-phi-3", + "model_family": "HFGenerativeModel", "model_category": "generative_local", "model_location": "hf_repo", + "context_window": 2048, "instruction_following": False, "prompt_wrapper": "human_bot", + "temperature": 0.5, "sample_default": True, "trailing_space": "", "gguf_file": "", "gguf_repo": "", + "link": "https://huggingface.co/llmware/slim-q-gen-phi-3", + "hf_repo": "llmware/slim-q-gen-phi-3", + "custom_model_files": [""], "custom_model_repo": "", + "output_type": "dict", "function_call": True, + "primary_keys": ["question"], + "fc_output_values": ["question"], + "marker_tokens": [], + "marker_token_lookup": {}, + "function": ["generate"]}, + + {"model_name": "slim-qa-gen-tiny-tool", "display_name": "llmware/slim-qa-gen-tiny-tool", + "model_family": "GGUFGenerativeModel", "model_category": "generative_local", "model_location": "llmware_repo", + "context_window": 4096, "instruction_following": False, "prompt_wrapper": "human_bot", + "temperature": 0.5, "sample_default": True, "trailing_space": "", + "gguf_file": "qa_gen_v3.gguf", + "gguf_repo": "llmware/slim-qa-gen-tiny-tool", + "link": "https://huggingface.co/slim-qa-gen-tiny-tool", + "custom_model_files": [], "custom_model_repo": "", + "output_type": "dict", + "function_call": True, + "primary_keys": ["question, answer"], # also accepts boolean and multiple choice + "fc_output_values": [], + "tokenizer": "llmware/slim-sentiment", + "tokenizer_local": "tokenizer_tl.json", + "marker_tokens": [], "marker_token_lookup": {}, + "function": ["generate"], + "snapshot": True}, + + {"model_name": 
"slim-qa-gen-phi-3-tool", "display_name": "slim-qa-gen-phi-3-tool", + "model_family": "GGUFGenerativeModel", "model_category": "generative_local", "model_location": "llmware_repo", + "context_window": 4096, "instruction_following": False, "prompt_wrapper": "human_bot", + "temperature": 0.3, "sample_default": True, "trailing_space": "", + "gguf_file": "qa_gen_v3.gguf", + "gguf_repo": "llmware/slim-qa-gen-phi-3-tool", + "link": "https://huggingface.co/llmware/slim-qa-gen-phi-3-tool", + "custom_model_files": [], "custom_model_repo": "", + "output_type": "dict", + "function_call": True, + "primary_keys": ["question, answer"], # also accepts boolean and multiple choice + "fc_output_values": [], + "tokenizer": "microsoft/Phi-3-mini-4k-instruct", + "tokenizer_local": "tokenizer_phi3.json", + "marker_tokens": [], "marker_token_lookup": {}, + "function": ["generate"], + "snapshot": True}, + + {"model_name": "llmware/slim-qa-gen-tiny", "display_name": "slim-qa-gen-tiny", + "model_family": "HFGenerativeModel", "model_category": "generative_local", "model_location": "hf_repo", + "context_window": 2048, "instruction_following": False, "prompt_wrapper": "human_bot", + "temperature": 0.5, "sample_default": True, "trailing_space": "", "gguf_file": "", "gguf_repo": "", + "link": "https://huggingface.co/llmware/slim-qa-gen-tiny", + "hf_repo": "llmware/slim-qa-gen-tiny", + "custom_model_files": [""], "custom_model_repo": "", + "output_type": "dict", "function_call": True, + "primary_keys": ["question, answer"], + "fc_output_values": ["question, answer"], + "marker_tokens": [], + "marker_token_lookup": {}, + "function": ["generate"]}, + + {"model_name": "llmware/slim-qa-gen-phi-3", "display_name": "slim-qa-gen-phi-3", + "model_family": "HFGenerativeModel", "model_category": "generative_local", "model_location": "hf_repo", + "context_window": 2048, "instruction_following": False, "prompt_wrapper": "human_bot", + "temperature": 0.5, "sample_default": True, "trailing_space": "", 
"gguf_file": "", "gguf_repo": "", + "link": "https://huggingface.co/llmware/slim-qa-gen-phi-3", + "hf_repo": "llmware/slim-qa-gen-phi-3", + "custom_model_files": [""], "custom_model_repo": "", + "output_type": "dict", "function_call": True, + "primary_keys": ["question, answer"], + "fc_output_values": ["question, answer"], + "marker_tokens": [], + "marker_token_lookup": {}, + "function": ["generate"]} ] From 8f132646b644dae2befe8ff0f7f27491d9b1ee84 Mon Sep 17 00:00:00 2001 From: wjt55 Date: Wed, 29 May 2024 09:45:18 -0400 Subject: [PATCH 03/48] fixed spelling error --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 64f7e143..856d5a8b 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -40,7 +40,7 @@ class Library: """Implements the interface to manage a collection of texts and images as a ``Library``. - ``Library`` is responsible for managing a collection of unstructured inofrmation, i.e. a library is a + ``Library`` is responsible for managing a collection of unstructured information, i.e. a library is a collection of texts and images. Returns From 8da7a2ee0d4c68a4dd8d680ad750cfc49179bcc1 Mon Sep 17 00:00:00 2001 From: willtaner Date: Wed, 29 May 2024 13:55:44 -0400 Subject: [PATCH 04/48] added docstrings to all Library methods --- llmware/library.py | 541 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 512 insertions(+), 29 deletions(-) diff --git a/llmware/library.py b/llmware/library.py index 856d5a8b..1a20b94d 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -132,7 +132,25 @@ def __init__(self): # explicit constructor to create a new library def create_new_library(self, library_name, account_name="llmware"): - """ Explicit constructor to create a new library with selected name """ + """ Explicit constructor to create a new library with selected name. + + If a library with the same name already exists, it will load the existing library. 
+ + Checks if library_name is safe. If not, it will change library_name to a safe name. + + Parameters + ---------- + library_name : str + name of the library to create + + account_name : str, default="llmware" + name of the account associated with the library + + Returns + ------- + library : Library + A new ``Library`` object representing the newly created or loaded existing library + """ # note: default behavior - if library with same name already exists, then it loads existing library @@ -239,7 +257,21 @@ def create_new_library(self, library_name, account_name="llmware"): def load_library(self, library_name, account_name="llmware"): - """ Load an existing library by invoking the library string name """ + """ Load an existing library by invoking the library string name + + Parameters + ---------- + library_name : str + Name of the library to load + + account_name : str, default="llmware" + Name of the account associated with the library + + Returns + ------- + library : Library + A new ``Library`` object representing the loaded library + """ # first check that library exists library_exists = self.check_if_library_exists(library_name, account_name=account_name) @@ -274,7 +306,21 @@ def load_library(self, library_name, account_name="llmware"): def get_library_card(self, library_name=None, account_name="llmware"): - """ Retrieves the library card dictionary with key attributes of library """ + """ Retrieves the library card dictionary with key attributes of library + + Parameters + ---------- + library_name : str, default=None + Name of the library to retrieve. If not provided, uses self.library_name + + account_name : str, default="llmware" + Name of the account associated to the library. If not provided, uses self.account_name + + Returns + ------- + library_card : dict or None + The library card dictionary containing key atrributes of the library. 
If not found, returns None + """ library_card = None @@ -296,7 +342,22 @@ def get_library_card(self, library_name=None, account_name="llmware"): def check_if_library_exists(self, library_name, account_name="llmware"): - """ Check if library exists by library string name """ + """ Check if library exists by library string name + + Parameters + ---------- + library_name : str + Name of library to check. + + account_name : str, default="llmware" + Name of account associated with library. + + Returns + ------- + library_card : dict or None + The library card dict if the library exists. If not found, returns None. + + """ # first look in library catalog library_card = LibraryCatalog().get_library_card(library_name, account_name=account_name) @@ -325,7 +386,36 @@ def update_embedding_status (self, status_message, embedding_model, embedding_db embedded_blocks=0, embedding_dims=0,time_stamp="NA",delete_record=False): """ Invoked at the end of the embedding job to update the library card and embedding record -- generally, - this method does not need to be invoked directly """ + this method does not need to be invoked directly + + Parameters + ---------- + status_message : str + Status message for the embedding process. If "delete", the record will be marked for deletion. + + embedding_model : str + Name of the embedding model used. + + embedding_db : str + Name of the embedding database used. + + embedded_blocks : int, default=0 + Number of embedded blocks. + + embedding_dims : int, default=0 + Dimensions of the embedding. + + time_stamp : str, default="NA" + Timestamp of the embedding process. + + delete_record : bool, default=False + If True, marks the record for deletion. + + Returns + ------- + bool + True if the embedding status was successfully updated. 
+ """ # special handling for updating "embedding" in update_library_card # -- append/insert this new embedding dict to the end of the embedding list @@ -347,7 +437,14 @@ def update_embedding_status (self, status_message, embedding_model, embedding_db def get_embedding_status (self): - """ Pulls the embedding record for the current library from the library card """ + """ Pulls the embedding record for the current library from the library card + + Returns + ------- + embedding_record : list or None + The embedding record, which is a list of dictionaries containing embedding status, model, and database. + If the library card or embedding record is not found, returns None. + """ library_card = LibraryCatalog(self).get_library_card(self.library_name, account_name=self.account_name) @@ -369,7 +466,13 @@ def get_embedding_status (self): def get_knowledge_graph_status (self): - """ Gets the status of creating the knowledge graph for the current library from the library card """ + """ Gets the status of creating the knowledge graph for the current library from the library card + + Returns + ------- + status_message : str + The status of the knowledge graph creation for the current library. + """ library_card = LibraryCatalog(self).get_library_card(self.library_name, self.account_name) @@ -383,7 +486,18 @@ def get_knowledge_graph_status (self): def set_knowledge_graph_status (self, status_message): - """ Updates the knowledge graph status on the library card after creating a knowledge graph """ + """ Updates the knowledge graph status on the library card after creating a knowledge graph + + Parameters + ---------- + status_message : str + The status message to set for the knowledge graph. + + Returns + ------- + bool + True if the knowledge graph status was successfully updated. 
+ """ update_dict = {"knowledge_graph": status_message} updater = LibraryCatalog(self).update_library_card(self.library_name,update_dict, account_name=self.account_name) @@ -393,7 +507,13 @@ def set_knowledge_graph_status (self, status_message): def get_and_increment_doc_id(self): """ Convenience method in library class - mirrors method in LibraryCatalog - increments, tracks and provides a - unique doc id for the library """ + unique doc id for the library + + Returns + ------- + unique_doc_id : int + The new unique document ID for the library. + """ unique_doc_id = LibraryCatalog(self).get_and_increment_doc_id(self.library_name) return unique_doc_id @@ -401,7 +521,30 @@ def get_and_increment_doc_id(self): def set_incremental_docs_blocks_images(self, added_docs=0, added_blocks=0, added_images=0, added_pages=0, added_tables=0): - """ Updates the library card with incremental counters after completing a parsing job """ + """ Updates the library card with incremental counters after completing a parsing job + + Parameters + ---------- + added_docs : int, default=0 + Number of documents added. + + added_blocks : int, default=0 + Number of blocks added. + + added_images : int, default=0 + Number of images added. + + added_pages : int, default=0 + Number of pages added. + + added_tables : int, default=0 + Number of tables added. + + Returns + ------- + bool + True if the incremental counters were successfully updated. + """ # updates counting parameters at end of parsing updater = LibraryCatalog(self).set_incremental_docs_blocks_images(added_docs=added_docs, @@ -415,7 +558,18 @@ def set_incremental_docs_blocks_images(self, added_docs=0, added_blocks=0, added def add_file(self, file_path): """ Ingests, parses, text chunks and indexes a single selected file to a library - - provide the full path to file """ + provide the full path to file + + Parameters + ---------- + file_path : str + The full path to the file to be ingested and indexed. 
+ + Returns + ------- + self : Library + The updated ``Library`` object after adding the file. + """ # Ensure the input path exists os.makedirs(LLMWareConfig.get_input_path(), exist_ok=True) @@ -432,7 +586,55 @@ def add_files (self, input_folder_path=None, encoding="utf-8",chunk_size=400, verbose_level=2, copy_files_to_library=True): """ Main method to integrate documents into a Library - pass a local filepath folder and all files will be - routed to appropriate parser by file type extension """ + routed to appropriate parser by file type extension + + Parameters + ---------- + input_folder_path : str, default=None + The path to the folder containing files to be ingested. If not provided, defaults to None. + + encoding : str, default="utf-8" + The encoding to use for reading files. + + chunk_size : int, default=400 + The size of text chunks to create during parsing. + + get_images : bool, default=True + Whether to extract images from the documents. + + get_tables : bool, default=True + Whether to extract tables from the documents. + + smart_chunking : int, default=1 + The strategy for smart chunking of text. + + max_chunk_size : int, default=600 + The maximum size of text chunks. + + table_grid : bool, default=True + Whether to use a grid for tables. + + get_header_text : bool, default=True + Whether to extract header text from the documents. + + table_strategy : int, default=1 + The strategy to use for table extraction. + + strip_header : bool, default=False + Whether to strip headers from the documents. + + verbose_level : int, default=2 + The level of verbosity for logging. + + copy_files_to_library : bool, default=True + Whether to copy the files to the library. + + Returns + ------- + output_results : dict or None + A dictionary containing the results of the document integration process, including counts of added documents, + blocks, images, pages, tables, and rejected files. If the library card could not be identified, returns None. 
+ """ if not input_folder_path: input_folder_path = LLMWareConfig.get_input_path() @@ -483,7 +685,30 @@ def add_files (self, input_folder_path=None, encoding="utf-8",chunk_size=400, def export_library_to_txt_file(self, output_fp=None, output_fn=None, include_text=True, include_tables=True, include_images=False): - """ Exports library collection of indexed text chunks to a txt file """ + """ Exports library collection of indexed text chunks to a txt file + + Parameters + ---------- + output_fp : str, default=None + The file path where the output file will be saved. If not provided, defaults to None. + + output_fn : str, default=None + The name of the output file. If not provided, defaults to None. + + include_text : bool, default=True + Whether to include text content in the export. + + include_tables : bool, default=True + Whether to include tables in the export. + + include_images : bool, default=False + Whether to include images in the export. + + Returns + ------- + file_location : str + The location of the exported txt file. + """ if not output_fp: output_fp = self.output_path @@ -517,7 +742,33 @@ def export_library_to_txt_file(self, output_fp=None, output_fn=None, include_tex def export_library_to_jsonl_file(self, output_fp, output_fn, include_text=True, include_tables=True, include_images=False, dict_keys=None): - """ Exports collection of text chunks to a jsonl file """ + """ Exports collection of text chunks to a jsonl file + + Parameters + ---------- + output_fp : str + The file path where the output file will be saved. + + output_fn : str + The name of the output file. + + include_text : bool, default=True + Whether to include text content in the export. + + include_tables : bool, default=True + Whether to include tables in the export. + + include_images : bool, default=False + Whether to include images in the export. + + dict_keys : list of str, default=None + The keys to include in the JSONL entries. If not provided, defaults to None. 
+ + Returns + ------- + file_location : str + The location of the exported JSONL file. + """ if not output_fp: output_fp = self.output_path @@ -563,7 +814,24 @@ def export_library_to_jsonl_file(self, output_fp, output_fn, include_text=True, def pull_files_from_cloud_bucket (self, aws_access_key=None, aws_secret_key=None, bucket_name=None): - """ Pull files from private S3 bucket into local cache for further processing """ + """ Pull files from private S3 bucket into local cache for further processing + + Parameters + ---------- + aws_access_key : str, default=None + The AWS access key for connecting to the S3 bucket. + + aws_secret_key : str, default=None + The AWS secret key for connecting to the S3 bucket. + + bucket_name : str, default=None + The name of the S3 bucket from which to pull files. + + Returns + ------- + files_copied : list + A list of file paths that were copied from the S3 bucket to the local cache. + """ files_copied = CloudBucketManager().connect_to_user_s3_bucket (aws_access_key, aws_secret_key, bucket_name, LLMWareConfig.get_input_path()) @@ -572,7 +840,13 @@ def pull_files_from_cloud_bucket (self, aws_access_key=None, aws_secret_key=None def generate_knowledge_graph(self): - """ Builds a statistical co-occurrence matrix for a library """ + """ Builds a statistical co-occurrence matrix for a library + + Returns + ------- + int + Returns 0 after successfully generating the knowledge graph and updating the status. 
+ """ kg = Graph(library=self).build_graph() self.set_knowledge_graph_status("yes") @@ -583,7 +857,48 @@ def install_new_embedding (self, embedding_model_name=None, vector_db=None, from_hf= False, from_sentence_transformer=False, model=None, tokenizer=None, model_api_key=None, vector_db_api_key=None, batch_size=500, max_len=None, use_gpu=True): - """ Main method for installing a new embedding on a library """ + """ Main method for installing a new embedding on a library + + Parameters + ---------- + embedding_model_name : str, default=None + The name of the embedding model to use. + + vector_db : str, default=None + The name of the vector database to use. + + from_hf : bool, default=False + Whether the model is from Hugging Face. + + from_sentence_transformer : bool, default=False + Whether the model is a Sentence Transformer. + + model : object, default=None + The pre-loaded model to use. + + tokenizer : object, default=None + The tokenizer associated with the pre-loaded model. + + model_api_key : str, default=None + The API key for accessing the model. + + vector_db_api_key : str, default=None + The API key for accessing the vector database. + + batch_size : int, default=500 + The batch size to use for embedding. + + max_len : int, default=None + The maximum length for embedding. + + use_gpu : bool, default=True + Whether to use GPU for embedding. + + Returns + ------- + embeddings : dict or None + The created embeddings dict, or None if no embeddings could be created. + """ embeddings = None my_model = None @@ -633,7 +948,21 @@ def install_new_embedding (self, embedding_model_name=None, vector_db=None, def delete_library(self, library_name=None, confirm_delete=False): - """ Deletes all artifacts of a library """ + """ Deletes all artifacts of a library + + Parameters + ---------- + library_name : str, default=None + The name of the library to delete. If not provided, defaults to None. 
+ + confirm_delete : bool, default=False + Confirmation flag to proceed with deletion. Must be set to True to delete the library. + + Returns + ------- + success_code : int + Returns 1 if the deletion was successful, or -1 if an error occurred. + """ if library_name: self.library_name = library_name @@ -664,7 +993,27 @@ def delete_library(self, library_name=None, confirm_delete=False): def update_block (self, doc_id, block_id, key, new_value): """ Convenience method to update the record of a specific block - identified by doc_ID and block_ID - in text collection database """ + in text collection database + + Parameters + ---------- + doc_id : int + The ID of the document containing the block to update. + + block_id : int + The ID of the block to update. + + key : str + The key in the block record to update. + + new_value : str + The new value to set for the specified key. + + Returns + ------- + completed : bool + True if the block was successfully updated, False otherwise. + """ completed = (CollectionWriter(self.library_name, account_name=self.account_name). update_block(doc_id, block_id,key,new_value,self.default_keys)) @@ -673,7 +1022,24 @@ def update_block (self, doc_id, block_id, key, new_value): def add_website (self, url, get_links=True, max_links=5): - """ Main method to ingest a website into a library """ + """ Main method to ingest a website into a library + + Parameters + ---------- + url : str + The URL of the website to ingest. + + get_links : bool, default=True + Whether to follow and ingest links found on the website. + + max_links : int, default=5 + The maximum number of links to follow and ingest. + + Returns + ------- + self : Library + The updated ``Library`` object after ingesting the website. 
+ """ Parser(library=self).parse_website(url,get_links=get_links,max_links=max_links) CollectionWriter(self.library_name, account_name=self.account_name).build_text_index() @@ -682,7 +1048,21 @@ def add_website (self, url, get_links=True, max_links=5): def add_wiki(self, topic_list,target_results=10): - """ Main method to add a wikipedia article to a library - enter a list of topics """ + """ Main method to add a wikipedia article to a library - enter a list of topics + + Parameters + ---------- + topic_list : list of str + A list of topics to search for on Wikipedia. + + target_results : int, default=10 + The target number of results to retrieve for each topic. + + Returns + ------- + self : Library + The updated ``Library`` object after adding the Wikipedia articles. + """ Parser(library=self).parse_wiki(topic_list,target_results=target_results) CollectionWriter(self.library_name, account_name=self.account_name).build_text_index() @@ -691,7 +1071,18 @@ def add_wiki(self, topic_list,target_results=10): def add_dialogs(self, input_folder=None): - """ Main method to add an AWS dialog transcript into a library """ + """ Main method to add an AWS dialog transcript into a library + + Parameters + ---------- + input_folder : str, default=None + The path to the folder containing the dialog transcripts. If not provided, defaults to None. + + Returns + ------- + self : Library + The updated ``Library`` object after adding the dialog transcripts. + """ if not input_folder: input_folder = LLMWareConfig.get_input_path() @@ -702,7 +1093,18 @@ def add_dialogs(self, input_folder=None): def add_image(self, input_folder=None): - """ Main method to add image and scanned OCR content into a library """ + """ Main method to add image and scanned OCR content into a library + + Parameters + ---------- + input_folder : str, default=None + The path to the folder containing the images. 
If not provided, defaults to None + + Returns + ------- + self : Library + The updated ``Library`` object after adding the image and OCR content. + """ if not input_folder: input_folder = LLMWareConfig.get_input_path() @@ -713,7 +1115,18 @@ def add_image(self, input_folder=None): def add_pdf_by_ocr(self, input_folder=None): - """ Alternative method to ingest PDFs that are scanned, or can not be otherwise parsed """ + """ Alternative method to ingest PDFs that are scanned, or can not be otherwise parsed + + Parameters + ---------- + input_folder : str, default=None + The path to the folder containing the PDFs. If not provided, defaults to None + + Returns + ------- + self : Library + The updated ``Library`` object after adding the PDFs through OCR. + """ if not input_folder: input_folder = LLMWareConfig.get_input_path() @@ -724,7 +1137,18 @@ def add_pdf_by_ocr(self, input_folder=None): def add_pdf(self, input_folder=None): - """ Convenience method to directly add PDFs only - note, in most cases, 'add_files' is the better option.""" + """ Convenience method to directly add PDFs only - note, in most cases, 'add_files' is the better option. + + Parameters + ---------- + input_folder : str, default=None + The path to the folder containing the PDFs. If not provided, defaults to None + + Returns + ------- + self : Library + The updated ``Library`` object after adding the PDFs. + """ if not input_folder: input_folder = LLMWareConfig.get_input_path() @@ -735,7 +1159,18 @@ def add_pdf(self, input_folder=None): def add_office(self, input_folder=None): - """ Convenience method to directly add PDFs only - note, in most cases, 'add_files' is the better option.""" + """ Convenience method to directly add PDFs only - note, in most cases, 'add_files' is the better option. + + Parameters + ---------- + input_folder : str, default=None + The path to the folder containing the Office documents. If not provided, defaults to None. 
+ + Returns + ------- + self : Library + The updated ``Library`` object after adding the Office documents. + """ if not input_folder: input_folder = LLMWareConfig.get_input_path() @@ -746,14 +1181,42 @@ def add_office(self, input_folder=None): def get_all_library_cards(self, account_name='llmware'): - """ Get all library cards for all libraries on account """ + """ Get all library cards for all libraries on account + + Parameters + ---------- + account_name : str, default='llmware' + The name of the account for which to retrieve all library cards. + + Returns + ------- + library_cards : list of dict + A list of all library card dictionaries for the specified account. + """ library_cards = LibraryCatalog(account_name=account_name).all_library_cards() return library_cards def delete_installed_embedding(self, embedding_model_name, vector_db, vector_db_api_key=None): - """ Deletes an installed embedding on specific combination of vector_db + embedding_model_name """ + """ Deletes an installed embedding on specific combination of vector_db + embedding_model_name + + Parameters + ---------- + embedding_model_name : str + The name of the embedding model to delete. + + vector_db : str + The name of the vector database from which to delete the embedding. + + vector_db_api_key : str, default=None + The API key for accessing the vector database. If not provided, defaults to None + + Returns + ------- + int + Returns 1 if the embedding was successfully deleted. + """ # insert safety check - confirm that this is valid combination with installed embedding lib_card = LibraryCatalog(self).get_library_card(self.library_name) @@ -782,7 +1245,27 @@ def run_ocr_on_images(self, add_to_library=False,chunk_size=400,min_size=10, rea """ Convenience method in Library class to pass Library to Parser to run OCR on all of the images found in the Library, and OCR-extracted text from the images directly into the Library as additional - blocks. """ + blocks. 
+ + Parameters + ---------- + add_to_library : bool, default=False + Whether to add the OCR-extracted text directly into the Library as additional blocks. + + chunk_size : int, default=400 + The size of text chunks to create during OCR processing. + + min_size : int, default=10 + The minimum size of text chunks to consider during OCR processing. + + realtime_progress : bool, default=True + Whether to display real-time progress during OCR processing. + + Returns + ------- + output : int + Returns 1 if running the OCR on the images was successful. + """ output = Parser(library=self).ocr_images_in_library(add_to_library=add_to_library, chunk_size=chunk_size,min_size=min_size, From 479fddd94d4c2b4dbdf02fcc2393302f45ca6ea6 Mon Sep 17 00:00:00 2001 From: DARREN OBERST Date: Wed, 29 May 2024 14:41:18 -0400 Subject: [PATCH 05/48] update to model class instantiation --- .../using-slim-q-gen.py | 0 .../using-slim-qa-gen.py | 0 llmware/agents.py | 61 ++++++--- llmware/models.py | 127 ++++++++++++++---- 4 files changed, 145 insertions(+), 43 deletions(-) rename examples/{Models => SLIM-Agents}/using-slim-q-gen.py (100%) rename examples/{Models => SLIM-Agents}/using-slim-qa-gen.py (100%) diff --git a/examples/Models/using-slim-q-gen.py b/examples/SLIM-Agents/using-slim-q-gen.py similarity index 100% rename from examples/Models/using-slim-q-gen.py rename to examples/SLIM-Agents/using-slim-q-gen.py diff --git a/examples/Models/using-slim-qa-gen.py b/examples/SLIM-Agents/using-slim-qa-gen.py similarity index 100% rename from examples/Models/using-slim-qa-gen.py rename to examples/SLIM-Agents/using-slim-qa-gen.py diff --git a/llmware/agents.py b/llmware/agents.py index 63312634..6f810e4b 100755 --- a/llmware/agents.py +++ b/llmware/agents.py @@ -1,5 +1,20 @@ + +# Copyright 2023-2024 llmware + +# Licensed under the Apache License, Version 2.0 (the "License"); you +# may not use this file except in compliance with the License. 
You +# may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. + """The agents module implements the two classes LLMfx and SQLTables, where LLMfx manages -Structured Language Instruction Models (SLIMs), the agents and SQLTables handels +Structured Language Instruction Models (SLIMs), the agents and SQLTables handles creating and accessing external SQL data. LLmfx currently only supports SLIM models, other model classes will be added over time. And SQLTables is an experimantal feature for creating and accessing SQLite. @@ -24,7 +39,8 @@ class LLMfx: - """Provides an interface to models to interact with text, e.g. to first perform named entity recogintion + + """Provides an interface to models to interact with text, e.g. to first perform named entity recognition (ner) and then answer a question you want to have answered. ``LLMfx`` provides a high-level orchestration abstraction that implements multi-model, multi-step processes @@ -49,20 +65,6 @@ class LLMfx: llmfx : LLMfx A new ``LLMfx`` object. - Examples - ---------- - >>> import llmware.agents - >>> llmware_agent = llmare.agents.LLMfx() - >>> llmware_agent.load_work( - 'My name is Michael Jones, and I am a long-time customer. ' - 'The Mixco product is not working currently, and it is having a negative impact ' - 'on my business, as we can not deliver our products while it is down. ' - 'This is the fourth time that I have called. My account number is 93203, and ' - 'my user name is mjones. 
Our company is based in Tampa, Florida.') - >>> llmware_agent.exec_function_call('ratings') - >>> llmware_agent.answer('What is a short shummary?', key='summary') - >>> llmware_agent.answer('What is the customer\'s account number and user name?', key='customer_info') - >>> llmware_agent.show_report() """ def __init__(self, api_key=None, verbose=True, analyze_mode=True): @@ -890,6 +892,33 @@ def nli(self, text1, text2, params=None): return self.exec_function_call("nli", text=context, params=params) + def q_gen(self, text=None, params=None): + + """ Executes a question-gen function call on a text, if passed directly, or will pull current work item from + the queue. Returns value output dictionary with the generated question. """ + + if not params: + params = ["question"] + + if isinstance(params, str): + params = [params] + + return self.exec_function_call("q_gen", text=text, params=params) + + def qa_gen(self, text=None, params=None): + + """ Executes a question-answer gen function call on a text, if passed directly, or will pull current work + item from the queue. Returns value output dictionary with two keys - "question" and "answer" generated. """ + + if not params: + # default parameter key + params = ["question, answer"] + + if isinstance(params, str): + params = [params] + + return self.exec_function_call("qa_gen", text=text, params=params) + def verify_llm_response(self, input_context, llm_response): """ Utility function to apply NLI to compare llm_response with the input context. """ diff --git a/llmware/models.py b/llmware/models.py index 2b133d52..b649af8c 100644 --- a/llmware/models.py +++ b/llmware/models.py @@ -1,4 +1,4 @@ -# Copyright 2023 llmware +# Copyright 2023-2024 llmware # Licensed under the Apache License, Version 2.0 (the "License"); you # may not use this file except in compliance with the License. 
You @@ -60,11 +60,25 @@ class _ModelRegistry: # pulls default model list from model_configs.py registered_models = global_model_repo_catalog_list - model_classes = ["HFGenerativeModel", "LLMWareModel", "GGUFGenerativeModel", "WhisperCPPModel", - "LLMWareSemanticModel", "HFEmbeddingModel", "OpenChatModel", "OllamaModel", - "OpenAIGenModel", "ClaudeModel", "GoogleGenModel", - "CohereGenModel", "JurassicModel", "AIBReadGPTModel", - "OpenAIEmbeddingModel", "CohereEmbeddingModel","GoogleEmbeddingModel"] + # global list of supported model classes with module lookup - and placeholder for other attributes over time + model_classes = {"HFGenerativeModel": {"module": "llmware.models", "open_source":True}, + "LLMWareModel": {"module": "llmware.models", "open_source": True}, + "GGUFGenerativeModel": {"module": "llmware.models", "open_source":True}, + "WhisperCPPModel": {"module": "llmware.models", "open_source": True}, + "LLMWareSemanticModel": {"module": "llmware.models", "open_source": True}, + "HFEmbeddingModel": {"module": "llmware.models", "open_source": True}, + "OpenChatModel": {"module": "llmware.models", "open_source": True}, + "OllamaModel":{"module": "llmware.models", "open_source": True}, + "OpenAIGenModel":{"module": "llmware.models", "open_source": False}, + "ClaudeModel":{"module": "llmware.models", "open_source": False}, + "GoogleGenModel":{"module": "llmware.models", "open_source": False}, + "CohereGenModel":{"module": "llmware.models", "open_source": False}, + "JurassicModel":{"module": "llmware.models", "open_source": False}, + "AIBReadGPTModel":{"module": "llmware.models", "open_source": True}, + "OpenAIEmbeddingModel":{"module": "llmware.models", "open_source": False}, + "CohereEmbeddingModel":{"module": "llmware.models", "open_source": False}, + "GoogleEmbeddingModel":{"module": "llmware.models", "open_source": False} + } # model card validation for registering new model - required attributes min_required_fields = ["model_name", "model_family", 
"model_category"] @@ -76,11 +90,11 @@ class _ModelRegistry: registered_wrappers = global_model_finetuning_prompt_wrappers_lookup - # list of function calling classifier tools + # list of specialized function calling tools llm_fx_tools = ["ner", "sentiment", "topics", "ratings", "emotions", "nli", "intent", "sql", "answer", "category", "tags", "summary", "xsum", "extract", - "boolean", "sa-ner","tags-3b"] + "boolean", "sa-ner","tags-3b", "q_gen", "qa_gen"] llm_fx_tools_map = {"ner": "slim-ner-tool", "sentiment": "slim-sentiment-tool", @@ -93,13 +107,14 @@ class _ModelRegistry: "answer": "bling-answer-tool", "category": "slim-category-tool", "intent": "slim-intent-tool", - # new tools added "summary": "slim-summary-tool", "xsum": "slim-xsum-tool", "extract": "slim-extract-tool", "boolean": "slim-boolean-tool", "sa-ner": "slim-sa-ner-tool", - "tags-3b": "slim-tags-3b-tool" + "tags-3b": "slim-tags-3b-tool", + "q_gen": "slim-q-gen-tiny-tool", + "qa_gen": "slim-qa-gen-tiny-tool" } @classmethod @@ -113,12 +128,12 @@ def get_model_classes(cls): return cls.model_classes @classmethod - def add_model_class(cls, new_class): + def add_model_class(cls, new_class, module="llmware.models", open_source=False): - """ Adds a new model class. By default, it assumes that the module is the current module, - e.g., 'llmware.models'. These options will be expanded in upcoming releases. """ + """ Adds a new model with flexibility to instantiate in new module. By default, it + assumes that the module is the current one, e.g., 'llmware.models'. 
""" - cls.model_classes.append(new_class) + cls.model_classes.update({new_class:{"module": module, "open_source": open_source}}) @classmethod def get_wrapper_list(cls): @@ -268,12 +283,6 @@ def __init__(self): # Builds on standard model classes with standard inference self.model_classes = _ModelRegistry().get_model_classes() - - # hard-coded list to be replaced in future release - self.open_source_model_classes = ["HFGenerativeModel", "LLMWareModel", "GGUFGenerativeModel", - "LLMWareSemanticModel","HFEmbeddingModel", "OpenChatModel", - "OllamaModel", "WhisperCPPModel"] - self.global_model_list = _ModelRegistry().get_model_list() self.account_name = None @@ -323,6 +332,23 @@ def load_model_registry(self, fp=None, fn="llmware_model_catalog.json"): return 0 + def add_model_cards_from_file(self, fp=None, fn="custom_models_manifest.json"): + + """ Utility method that loads model cards from a single json file and incrementally adds + to the model global model list. """ + + if not fp: + fp = LLMWareConfig().get_model_repo_path() + + model_add_list = json.load(open(os.path.join(fp, fn), "r")) + + for i, model in enumerate(model_add_list): + _ModelRegistry().add_model(model) + + self.global_model_list = _ModelRegistry().get_model_list() + + return 0 + def register_new_model_card(self, model_card_dict): """ Registers a new model card directly in the model catalog """ @@ -672,12 +698,14 @@ def locate_and_retrieve_model_bits (self, model_card, api_key=None): raise ModelNotFoundException(model_folder_name) - def _instantiate_model_class_from_string(self, model_class, model_name, model_card, api_key=None, - api_endpoint=None): + def _instantiate_model_class_from_string_deprecated(self, model_class, model_name, model_card, api_key=None, + api_endpoint=None): - """ Internal utility method to instantiate model classes from strings. + """ DEPRECATED - Internal utility method to instantiate model classes from strings. 
Provides an + explicit lookup to model class - deprecated and replaced with dynamic import to provide more + flexibility and extensibility to add new model classes from other modules. - NOTE: this method will be replaced and deprecated for importlib dynamic lookup in upcoming release. + NOTE: will be removed in upcoming release. """ @@ -788,6 +816,45 @@ def _instantiate_model_class_from_string(self, model_class, model_name, model_ca return my_model + def _instantiate_model_class_from_string(self, model_class, model_name, model_card, api_key=None, + api_endpoint=None): + + """ Internal utility method to instantiate model classes from strings. """ + + # by default - if model not found - return None + my_model = None + context_window= 2048 # used in generative models - use 2048 as default safe backup + embedding_dims = None # used in embedding models + + if "context_window" in model_card: + context_window = model_card["context_window"] + + if "embedding_dims" in model_card: + embedding_dims = model_card["embedding_dims"] + + if model_class in self.model_classes: + + module = self.model_classes[model_class]["module"] + model_module = importlib.import_module(module) + if hasattr(model_module, model_class): + model_class = getattr(model_module, model_class) + + my_model = model_class(model_name=model_name, context_window=context_window, + api_key=api_key, + trust_remote_code=True, + model_card=model_card, + use_gpu_if_available=self.use_gpu, + get_logits=self.get_logits, + temperature=self.temperature, + max_output=self.max_output, + sample=self.sample, + embedding_dims=embedding_dims, + api_endpoint=api_endpoint) + else: + raise LLMWareException(message=f"Exception: {model_class} not found.") + + return my_model + def load_model (self, selected_model, api_key=None, use_gpu=True, sample=True,get_logits=False, max_output=100, temperature=-99, force_reload=False, api_endpoint=None): @@ -921,14 +988,20 @@ def load_embedding_model (self, model_name=None, def 
list_open_source_models(self): - """ Lists the open source models in the ModelCatalog. This method will be updated/replaced in - future release. """ + """ Lists the open source models in the ModelCatalog. """ open_source_models = [] + open_source_class = [] + model_classes = _ModelRegistry().get_model_classes() + for key, value in model_classes.items(): + if "open_source" in value: + if value["open_source"]: + open_source_class.append(key) + for x in self.global_model_list: - if x["model_family"] in self.open_source_model_classes: + if x["model_family"] in open_source_class: open_source_models.append(x) return open_source_models From b959122614133fc5483f50255326f2e47fd22718 Mon Sep 17 00:00:00 2001 From: Darren Oberst <41238031+doberst@users.noreply.github.com> Date: Wed, 29 May 2024 14:59:25 -0400 Subject: [PATCH 06/48] Update README.md --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 14415851..63fdc118 100644 --- a/README.md +++ b/README.md @@ -804,6 +804,13 @@ Questions and discussions are welcome in our [github discussions](https://github See also [additional deployment/install release notes in wheel_archives](https://github.com/llmware-ai/llmware/tree/main/wheel_archives) +**Wednesday, May 29 - v0.3.0-WIP** +- Added two new SLIM models to catalog and agent processes - ['q-gen'](https://github.com/llmware-ai/llmware/tree/main/examples/SLIM-Agents/using-slim-q-gen.py) and ['qa-gen'](https://github.com/llmware-ai/llmware/tree/main/examples/SLIM-Agents/using-slim-qa-gen.py) +- Updated model class instantiation to provide more extensibility to add new classes in different modules +- Planning to remove torch and transformers from pip install package +- If cloning the repo, please see the new welcome_to_llmware.sh and welcome_to_llmware_windows.sh fast install scripts +- Changes merged in main branch and will be released in pypi targeting end of week + **Wednesday, May 22 - v0.2.15** - Improvements in Model class 
handling of Pytorch and Transformers dependencies (just-in-time loading, if needed) - Expanding API endpoint options and inference server functionality - see new [client access options](https://github.com/llmware-ai/llmware/tree/main/examples/Use_Cases/llmware_inference_api_client.py) and [server_launch](https://github.com/llmware-ai/llmware/tree/main/examples/Use_Cases/llmware_inference_server.py) From 394f70cef4179404a7668b224675676bb54909e7 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 20:55:40 -0400 Subject: [PATCH 07/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 1a20b94d..58e061f3 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -132,7 +132,7 @@ def __init__(self): # explicit constructor to create a new library def create_new_library(self, library_name, account_name="llmware"): - """ Explicit constructor to create a new library with selected name. + """Explicit constructor to create a new library with selected name. If a library with the same name already exists, it will load the existing library. From 4ffb1fb06184fc70a0dbbd417e673bcf6001efd7 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 20:56:42 -0400 Subject: [PATCH 08/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 58e061f3..3b77ca60 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -257,7 +257,7 @@ def create_new_library(self, library_name, account_name="llmware"): def load_library(self, library_name, account_name="llmware"): - """ Load an existing library by invoking the library string name + """Load an existing library by invoking the library string name. 
Parameters ---------- From 8678f73741d0b461f5df102d95567fda1cd5a73b Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 20:57:44 -0400 Subject: [PATCH 09/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 3b77ca60..4862e3fa 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -306,7 +306,7 @@ def load_library(self, library_name, account_name="llmware"): def get_library_card(self, library_name=None, account_name="llmware"): - """ Retrieves the library card dictionary with key attributes of library + """Retrieves the library card dictionary with key attributes of library. Parameters ---------- From 0dea9f65fde7eef681e1736d52834c02cc3afac8 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 20:58:07 -0400 Subject: [PATCH 10/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 4862e3fa..4431d5d3 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -385,7 +385,7 @@ def check_if_library_exists(self, library_name, account_name="llmware"): def update_embedding_status (self, status_message, embedding_model, embedding_db, embedded_blocks=0, embedding_dims=0,time_stamp="NA",delete_record=False): - """ Invoked at the end of the embedding job to update the library card and embedding record -- generally, + """Invoked at the end of the embedding job to update the library card and embedding record -- generally, this method does not need to be invoked directly Parameters From 7a606e8bc7994096c45531f12cc6b22d20624afc Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 20:58:25 -0400 Subject: [PATCH 11/48] Update llmware/library.py 
Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 4431d5d3..b7050f2b 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -342,7 +342,7 @@ def get_library_card(self, library_name=None, account_name="llmware"): def check_if_library_exists(self, library_name, account_name="llmware"): - """ Check if library exists by library string name + """Check if library exists by library string name. Parameters ---------- From ac6e628350698b927a7977ced49f09ae218cf4d6 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:01:21 -0400 Subject: [PATCH 12/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index b7050f2b..368ce597 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -386,7 +386,7 @@ def update_embedding_status (self, status_message, embedding_model, embedding_db embedded_blocks=0, embedding_dims=0,time_stamp="NA",delete_record=False): """Invoked at the end of the embedding job to update the library card and embedding record -- generally, - this method does not need to be invoked directly + this method does not need to be invoked directly. 
Parameters ---------- From 42f428ac04635e0935f55a091f9296b050ec20e3 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:01:40 -0400 Subject: [PATCH 13/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 368ce597..95ae787c 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -437,7 +437,7 @@ def update_embedding_status (self, status_message, embedding_model, embedding_db def get_embedding_status (self): - """ Pulls the embedding record for the current library from the library card + """Pulls the embedding record for the current library from the library card. Returns ------- From c472248069c5e3e3759002fbd9533bd411a6fcd6 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:01:55 -0400 Subject: [PATCH 14/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 95ae787c..2b1b8fa2 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -466,7 +466,7 @@ def get_embedding_status (self): def get_knowledge_graph_status (self): - """ Gets the status of creating the knowledge graph for the current library from the library card + """Gets the status of creating the knowledge graph for the current library from the library card. 
Returns ------- From bc958689fb251c75df6b28fa450710f99a57f928 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:02:09 -0400 Subject: [PATCH 15/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 2b1b8fa2..4c7a73cd 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -486,7 +486,7 @@ def get_knowledge_graph_status (self): def set_knowledge_graph_status (self, status_message): - """ Updates the knowledge graph status on the library card after creating a knowledge graph + """Updates the knowledge graph status on the library card after creating a knowledge graph. Parameters ---------- From a5fa3cd325d60f820b6925c37a6f25513999945c Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:02:24 -0400 Subject: [PATCH 16/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 4c7a73cd..3cc71bf7 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -506,7 +506,7 @@ def set_knowledge_graph_status (self, status_message): def get_and_increment_doc_id(self): - """ Convenience method in library class - mirrors method in LibraryCatalog - increments, tracks and provides a + """Convenience method in library class - mirrors method in LibraryCatalog - increments, tracks and provides a unique doc id for the library Returns From 666e71f5c64ce5f580db37007b18591e51e59678 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:02:38 -0400 Subject: [PATCH 17/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 
3cc71bf7..626afa58 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -507,7 +507,7 @@ def set_knowledge_graph_status (self, status_message): def get_and_increment_doc_id(self): """Convenience method in library class - mirrors method in LibraryCatalog - increments, tracks and provides a - unique doc id for the library + unique doc id for the library. Returns ------- From f94d0cc400683084f8f2ebcda34049e266f3e3c8 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:02:53 -0400 Subject: [PATCH 18/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 626afa58..60021bbb 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -521,7 +521,7 @@ def get_and_increment_doc_id(self): def set_incremental_docs_blocks_images(self, added_docs=0, added_blocks=0, added_images=0, added_pages=0, added_tables=0): - """ Updates the library card with incremental counters after completing a parsing job + """Updates the library card with incremental counters after completing a parsing job. 
Parameters ---------- From 23d2b71db026c17afcc3e3607147fe1ddc288e33 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:03:09 -0400 Subject: [PATCH 19/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 60021bbb..2483926e 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -557,7 +557,7 @@ def set_incremental_docs_blocks_images(self, added_docs=0, added_blocks=0, added def add_file(self, file_path): - """ Ingests, parses, text chunks and indexes a single selected file to a library - + """Ingests, parses, text chunks and indexes a single selected file to a library - provide the full path to file Parameters From 6a2b371c8e4c7aa980908ca219dfdd653268df55 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:03:24 -0400 Subject: [PATCH 20/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 2483926e..2022cdeb 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -558,7 +558,7 @@ def set_incremental_docs_blocks_images(self, added_docs=0, added_blocks=0, added def add_file(self, file_path): """Ingests, parses, text chunks and indexes a single selected file to a library - - provide the full path to file + provide the full path to file. 
Parameters ---------- From a8c22b2daf39429db4c12125431f91c7c6104d32 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:03:38 -0400 Subject: [PATCH 21/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 2022cdeb..2df3b76b 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -585,7 +585,7 @@ def add_files (self, input_folder_path=None, encoding="utf-8",chunk_size=400, table_grid=True, get_header_text=True, table_strategy=1, strip_header=False, verbose_level=2, copy_files_to_library=True): - """ Main method to integrate documents into a Library - pass a local filepath folder and all files will be + """Main method to integrate documents into a Library - pass a local filepath folder and all files will be routed to appropriate parser by file type extension Parameters From 8f4a1e083c3a31b8a1e2a3d6d0cce397393e3107 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:03:58 -0400 Subject: [PATCH 22/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 2df3b76b..6ef6fc61 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -586,7 +586,7 @@ def add_files (self, input_folder_path=None, encoding="utf-8",chunk_size=400, verbose_level=2, copy_files_to_library=True): """Main method to integrate documents into a Library - pass a local filepath folder and all files will be - routed to appropriate parser by file type extension + routed to appropriate parser by file type extension. 
Parameters ---------- From df3b7700983e563413cfe420ccb149f94f15be89 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:04:12 -0400 Subject: [PATCH 23/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 6ef6fc61..a1710b76 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -685,7 +685,7 @@ def add_files (self, input_folder_path=None, encoding="utf-8",chunk_size=400, def export_library_to_txt_file(self, output_fp=None, output_fn=None, include_text=True, include_tables=True, include_images=False): - """ Exports library collection of indexed text chunks to a txt file + """Exports library collection of indexed text chunks to a txt file. Parameters ---------- From 9fe9ad94ffba0c9c5bdc67a2d2a8ffcf971205ed Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:04:26 -0400 Subject: [PATCH 24/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index a1710b76..fced1794 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -742,7 +742,7 @@ def export_library_to_txt_file(self, output_fp=None, output_fn=None, include_tex def export_library_to_jsonl_file(self, output_fp, output_fn, include_text=True, include_tables=True, include_images=False, dict_keys=None): - """ Exports collection of text chunks to a jsonl file + """Exports collection of text chunks to a jsonl file. 
Parameters ---------- From 05c8deecb91e02574f6f5ba4214471ac98e377c5 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:04:40 -0400 Subject: [PATCH 25/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index fced1794..d029813b 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -814,7 +814,7 @@ def export_library_to_jsonl_file(self, output_fp, output_fn, include_text=True, def pull_files_from_cloud_bucket (self, aws_access_key=None, aws_secret_key=None, bucket_name=None): - """ Pull files from private S3 bucket into local cache for further processing + """Pull files from private S3 bucket into local cache for further processing. Parameters ---------- From 9e5527f3f0fb703210e8758f048cd434c1005c01 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:04:53 -0400 Subject: [PATCH 26/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index d029813b..6679dd2e 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -840,7 +840,7 @@ def pull_files_from_cloud_bucket (self, aws_access_key=None, aws_secret_key=None def generate_knowledge_graph(self): - """ Builds a statistical co-occurrence matrix for a library + """Builds a statistical co-occurrence matrix for a library. 
Returns ------- From 796bc8aa40700bb6937cd44b9079bc0ed06a9121 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:05:06 -0400 Subject: [PATCH 27/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 6679dd2e..a5637775 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -857,7 +857,7 @@ def install_new_embedding (self, embedding_model_name=None, vector_db=None, from_hf= False, from_sentence_transformer=False, model=None, tokenizer=None, model_api_key=None, vector_db_api_key=None, batch_size=500, max_len=None, use_gpu=True): - """ Main method for installing a new embedding on a library + """Main method for installing a new embedding on a library. Parameters ---------- From 46bde4fdd83ae4a700b3a9b9d693350ceed6da1b Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:05:21 -0400 Subject: [PATCH 28/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index a5637775..d8b96cb5 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -948,7 +948,7 @@ def install_new_embedding (self, embedding_model_name=None, vector_db=None, def delete_library(self, library_name=None, confirm_delete=False): - """ Deletes all artifacts of a library + """Deletes all artifacts of a library. 
Parameters ---------- From f65eed4f1e54493b6a01f424179324fce64b40c7 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:05:39 -0400 Subject: [PATCH 29/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index d8b96cb5..fc0cb955 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -992,7 +992,7 @@ def delete_library(self, library_name=None, confirm_delete=False): def update_block (self, doc_id, block_id, key, new_value): - """ Convenience method to update the record of a specific block - identified by doc_ID and block_ID + """Convenience method to update the record of a specific block - identified by doc_ID and block_ID in text collection database Parameters From ad55baf7a05d0e188c540082a02a90a024d0f494 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:05:54 -0400 Subject: [PATCH 30/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index fc0cb955..c8549a67 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -993,7 +993,7 @@ def delete_library(self, library_name=None, confirm_delete=False): def update_block (self, doc_id, block_id, key, new_value): """Convenience method to update the record of a specific block - identified by doc_ID and block_ID - in text collection database + in text collection database. 
Parameters ---------- From d159a2ab34869b9d1dd1a222eee8caa1b076c59d Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:06:05 -0400 Subject: [PATCH 31/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index c8549a67..29e9baa3 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -1022,7 +1022,7 @@ def update_block (self, doc_id, block_id, key, new_value): def add_website (self, url, get_links=True, max_links=5): - """ Main method to ingest a website into a library + """Main method to ingest a website into a library. Parameters ---------- From 64183f1cce66ee5a5525d990879e2b209fe8aa3c Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:06:17 -0400 Subject: [PATCH 32/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 29e9baa3..b4e08a32 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -1048,7 +1048,7 @@ def add_website (self, url, get_links=True, max_links=5): def add_wiki(self, topic_list,target_results=10): - """ Main method to add a wikipedia article to a library - enter a list of topics + """Main method to add a wikipedia article to a library - enter a list of topics. 
Parameters ---------- From 34507955e0de18521962a9648dec108ef588357f Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:06:28 -0400 Subject: [PATCH 33/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index b4e08a32..83a83f7c 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -1071,7 +1071,7 @@ def add_wiki(self, topic_list,target_results=10): def add_dialogs(self, input_folder=None): - """ Main method to add an AWS dialog transcript into a library + """Main method to add an AWS dialog transcript into a library. Parameters ---------- From f82facf45ec11fc948afd59dc210755338652f92 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:06:39 -0400 Subject: [PATCH 34/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 83a83f7c..5a44b97f 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -1093,7 +1093,7 @@ def add_dialogs(self, input_folder=None): def add_image(self, input_folder=None): - """ Main method to add image and scanned OCR content into a library + """Main method to add image and scanned OCR content into a library. 
Parameters ---------- From b475824dc3ba24324993d87c6436f40796cefe47 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:06:51 -0400 Subject: [PATCH 35/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 5a44b97f..8ba77be6 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -1115,7 +1115,7 @@ def add_image(self, input_folder=None): def add_pdf_by_ocr(self, input_folder=None): - """ Alternative method to ingest PDFs that are scanned, or can not be otherwise parsed + """Alternative method to ingest PDFs that are scanned, or can not be otherwise parsed. Parameters ---------- From a72f206b5f5f9b407a500bc7146b5c0fcc400e79 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:07:04 -0400 Subject: [PATCH 36/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 8ba77be6..860a3e8a 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -1137,7 +1137,7 @@ def add_pdf_by_ocr(self, input_folder=None): def add_pdf(self, input_folder=None): - """ Convenience method to directly add PDFs only - note, in most cases, 'add_files' is the better option. + """Convenience method to directly add PDFs only - note, in most cases, 'add_files' is the better option. 
Parameters ---------- From 993b5a69499676780b21cc38cf0c2143d18ec0fe Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:07:15 -0400 Subject: [PATCH 37/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 860a3e8a..41e76155 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -1159,7 +1159,7 @@ def add_pdf(self, input_folder=None): def add_office(self, input_folder=None): - """ Convenience method to directly add PDFs only - note, in most cases, 'add_files' is the better option. + """Convenience method to directly add PDFs only - note, in most cases, 'add_files' is the better option. Parameters ---------- From 205ae04ef90f66e7a871003f89f78dfb368f8ba9 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:07:26 -0400 Subject: [PATCH 38/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 41e76155..b7214f5c 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -1181,7 +1181,7 @@ def add_office(self, input_folder=None): def get_all_library_cards(self, account_name='llmware'): - """ Get all library cards for all libraries on account + """Get all library cards for all libraries on account. 
Parameters ---------- From d24bf2e47660d4cb2baf83cdda23f1e6ab628837 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:07:38 -0400 Subject: [PATCH 39/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index b7214f5c..9486956a 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -1199,7 +1199,7 @@ def get_all_library_cards(self, account_name='llmware'): def delete_installed_embedding(self, embedding_model_name, vector_db, vector_db_api_key=None): - """ Deletes an installed embedding on specific combination of vector_db + embedding_model_name + """Deletes an installed embedding on specific combination of vector_db + embedding_model_name. Parameters ---------- From 1153f2c4a9351ae19050cd7a12f625af12bf4060 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:07:49 -0400 Subject: [PATCH 40/48] Update llmware/library.py Co-authored-by: MacOS --- llmware/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/library.py b/llmware/library.py index 9486956a..4c811da6 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -1243,7 +1243,7 @@ def delete_installed_embedding(self, embedding_model_name, vector_db, vector_db_ def run_ocr_on_images(self, add_to_library=False,chunk_size=400,min_size=10, realtime_progress=True): - """ Convenience method in Library class to pass Library to Parser to run OCR on all of the images + """Convenience method in Library class to pass Library to Parser to run OCR on all of the images found in the Library, and OCR-extracted text from the images directly into the Library as additional blocks. 
From 83920fbac1241ee37609c922f4cc6731bb5df310 Mon Sep 17 00:00:00 2001 From: Will Taner <72456136+willtaner@users.noreply.github.com> Date: Wed, 29 May 2024 21:11:23 -0400 Subject: [PATCH 41/48] Update library.py, removing blank lines --- llmware/library.py | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/llmware/library.py b/llmware/library.py index 4c811da6..75df078a 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -131,7 +131,6 @@ def __init__(self): # explicit constructor to create a new library def create_new_library(self, library_name, account_name="llmware"): - """Explicit constructor to create a new library with selected name. If a library with the same name already exists, it will load the existing library. @@ -256,7 +255,6 @@ def create_new_library(self, library_name, account_name="llmware"): return self def load_library(self, library_name, account_name="llmware"): - """Load an existing library by invoking the library string name. Parameters @@ -305,7 +303,6 @@ def load_library(self, library_name, account_name="llmware"): return self def get_library_card(self, library_name=None, account_name="llmware"): - """Retrieves the library card dictionary with key attributes of library. Parameters @@ -341,7 +338,6 @@ def get_library_card(self, library_name=None, account_name="llmware"): return library_card def check_if_library_exists(self, library_name, account_name="llmware"): - """Check if library exists by library string name. Parameters @@ -384,7 +380,6 @@ def check_if_library_exists(self, library_name, account_name="llmware"): def update_embedding_status (self, status_message, embedding_model, embedding_db, embedded_blocks=0, embedding_dims=0,time_stamp="NA",delete_record=False): - """Invoked at the end of the embedding job to update the library card and embedding record -- generally, this method does not need to be invoked directly. 
@@ -436,7 +431,6 @@ def update_embedding_status (self, status_message, embedding_model, embedding_db return True def get_embedding_status (self): - """Pulls the embedding record for the current library from the library card. Returns @@ -465,7 +459,6 @@ def get_embedding_status (self): return embedding_record def get_knowledge_graph_status (self): - """Gets the status of creating the knowledge graph for the current library from the library card. Returns @@ -485,7 +478,6 @@ def get_knowledge_graph_status (self): return status_message def set_knowledge_graph_status (self, status_message): - """Updates the knowledge graph status on the library card after creating a knowledge graph. Parameters @@ -505,7 +497,6 @@ def set_knowledge_graph_status (self, status_message): return True def get_and_increment_doc_id(self): - """Convenience method in library class - mirrors method in LibraryCatalog - increments, tracks and provides a unique doc id for the library. @@ -520,7 +511,6 @@ def get_and_increment_doc_id(self): def set_incremental_docs_blocks_images(self, added_docs=0, added_blocks=0, added_images=0, added_pages=0, added_tables=0): - """Updates the library card with incremental counters after completing a parsing job. Parameters @@ -556,7 +546,6 @@ def set_incremental_docs_blocks_images(self, added_docs=0, added_blocks=0, added return True def add_file(self, file_path): - """Ingests, parses, text chunks and indexes a single selected file to a library - provide the full path to file. @@ -584,7 +573,6 @@ def add_files (self, input_folder_path=None, encoding="utf-8",chunk_size=400, get_images=True,get_tables=True, smart_chunking=1, max_chunk_size=600, table_grid=True, get_header_text=True, table_strategy=1, strip_header=False, verbose_level=2, copy_files_to_library=True): - """Main method to integrate documents into a Library - pass a local filepath folder and all files will be routed to appropriate parser by file type extension. 
@@ -684,7 +672,6 @@ def add_files (self, input_folder_path=None, encoding="utf-8",chunk_size=400, def export_library_to_txt_file(self, output_fp=None, output_fn=None, include_text=True, include_tables=True, include_images=False): - """Exports library collection of indexed text chunks to a txt file. Parameters @@ -741,7 +728,6 @@ def export_library_to_txt_file(self, output_fp=None, output_fn=None, include_tex def export_library_to_jsonl_file(self, output_fp, output_fn, include_text=True, include_tables=True, include_images=False, dict_keys=None): - """Exports collection of text chunks to a jsonl file. Parameters @@ -813,7 +799,6 @@ def export_library_to_jsonl_file(self, output_fp, output_fn, include_text=True, return file_location def pull_files_from_cloud_bucket (self, aws_access_key=None, aws_secret_key=None, bucket_name=None): - """Pull files from private S3 bucket into local cache for further processing. Parameters @@ -839,7 +824,6 @@ def pull_files_from_cloud_bucket (self, aws_access_key=None, aws_secret_key=None return files_copied def generate_knowledge_graph(self): - """Builds a statistical co-occurrence matrix for a library. Returns @@ -856,7 +840,6 @@ def generate_knowledge_graph(self): def install_new_embedding (self, embedding_model_name=None, vector_db=None, from_hf= False, from_sentence_transformer=False, model=None, tokenizer=None, model_api_key=None, vector_db_api_key=None, batch_size=500, max_len=None, use_gpu=True): - """Main method for installing a new embedding on a library. Parameters @@ -947,7 +930,6 @@ def install_new_embedding (self, embedding_model_name=None, vector_db=None, return embeddings def delete_library(self, library_name=None, confirm_delete=False): - """Deletes all artifacts of a library. 
Parameters @@ -991,7 +973,6 @@ def delete_library(self, library_name=None, confirm_delete=False): return success_code def update_block (self, doc_id, block_id, key, new_value): - """Convenience method to update the record of a specific block - identified by doc_ID and block_ID in text collection database. @@ -1021,7 +1002,6 @@ def update_block (self, doc_id, block_id, key, new_value): return completed def add_website (self, url, get_links=True, max_links=5): - """Main method to ingest a website into a library. Parameters @@ -1047,7 +1027,6 @@ def add_website (self, url, get_links=True, max_links=5): return self def add_wiki(self, topic_list,target_results=10): - """Main method to add a wikipedia article to a library - enter a list of topics. Parameters @@ -1070,7 +1049,6 @@ def add_wiki(self, topic_list,target_results=10): return self def add_dialogs(self, input_folder=None): - """Main method to add an AWS dialog transcript into a library. Parameters @@ -1092,7 +1070,6 @@ def add_dialogs(self, input_folder=None): return self def add_image(self, input_folder=None): - """Main method to add image and scanned OCR content into a library. Parameters @@ -1114,7 +1091,6 @@ def add_image(self, input_folder=None): return self def add_pdf_by_ocr(self, input_folder=None): - """Alternative method to ingest PDFs that are scanned, or can not be otherwise parsed. Parameters @@ -1136,7 +1112,6 @@ def add_pdf_by_ocr(self, input_folder=None): return self def add_pdf(self, input_folder=None): - """Convenience method to directly add PDFs only - note, in most cases, 'add_files' is the better option. Parameters @@ -1158,7 +1133,6 @@ def add_pdf(self, input_folder=None): return self def add_office(self, input_folder=None): - """Convenience method to directly add PDFs only - note, in most cases, 'add_files' is the better option. 
Parameters @@ -1180,7 +1154,6 @@ def add_office(self, input_folder=None): return self def get_all_library_cards(self, account_name='llmware'): - """Get all library cards for all libraries on account. Parameters @@ -1198,7 +1171,6 @@ def get_all_library_cards(self, account_name='llmware'): return library_cards def delete_installed_embedding(self, embedding_model_name, vector_db, vector_db_api_key=None): - """Deletes an installed embedding on specific combination of vector_db + embedding_model_name. Parameters @@ -1242,7 +1214,6 @@ def delete_installed_embedding(self, embedding_model_name, vector_db, vector_db_ return 1 def run_ocr_on_images(self, add_to_library=False,chunk_size=400,min_size=10, realtime_progress=True): - """Convenience method in Library class to pass Library to Parser to run OCR on all of the images found in the Library, and OCR-extracted text from the images directly into the Library as additional blocks. From 4e8cae5382b5378bde35dfe7595299cb1a44680f Mon Sep 17 00:00:00 2001 From: DARREN OBERST Date: Thu, 30 May 2024 05:57:03 -0400 Subject: [PATCH 42/48] fixing library delete error --- llmware/library.py | 27 ++++++++--------------- llmware/resources.py | 51 ++++++++++++++++++++++++++++++++------------ 2 files changed, 46 insertions(+), 32 deletions(-) diff --git a/llmware/library.py b/llmware/library.py index 64f7e143..525bf784 100644 --- a/llmware/library.py +++ b/llmware/library.py @@ -1,4 +1,4 @@ -# Copyright 2023 llmware +# Copyright 2023-2024 llmware # Licensed under the Apache License, Version 2.0 (the "License"); you # may not use this file except in compliance with the License. You @@ -14,7 +14,7 @@ """The library module implements the logic for managing unstructured information (the text). -The module implements the two classes Library and LibraryCatalog. Library is responsible for organising a +The module implements the two classes Library and LibraryCatalog. 
Library is responsible for organizing a collection of text and is the interface for the Parser and Embedding classes. In addition, the Library object is passed to the Query and Prompt objects. The Library class uses the LibraryCatalog for creating, deleting, updating, and other tasks pertaining to Libraries via the Library Card. @@ -38,28 +38,16 @@ class Library: - """Implements the interface to manage a collection of texts and images as a ``Library``. - ``Library`` is responsible for managing a collection of unstructured inofrmation, i.e. a library is a - collection of texts and images. + """Implements the interface to manage a collection of unstructured information as a ``Library``, i.e. a + library is an indexed collection of texts, tables and images extracted from parsed files. Returns ------- library : Library A new ``Library`` object. - - Examples - ---------- - >>> import os - >>> import llmware.library - >>> import llmware.setup - >>> sample_files_path = llmware.setup.Setup().load_sample_files(over_write=True) - >>> agreements_path = os.path.join(sample_files_path, 'Agreements') - >>> library = llmare.library.Library().create_new_library('my-new-library') - >>> library.add_files(agreements_path) - >>> library_card = library.get_library_card() - >>> library_card['documents'] """ + def __init__(self): # default settings for basic parameters @@ -631,13 +619,16 @@ def install_new_embedding (self, embedding_model_name=None, vector_db=None, return embeddings - def delete_library(self, library_name=None, confirm_delete=False): + def delete_library(self, library_name=None, confirm_delete=False, account_name="llmware"): """ Deletes all artifacts of a library """ if library_name: self.library_name = library_name + # loads the library specific path information if required + self.load_library(library_name,account_name=account_name) + success_code = 1 try: diff --git a/llmware/resources.py b/llmware/resources.py index 55328e4e..945019b6 100644 --- a/llmware/resources.py 
+++ b/llmware/resources.py @@ -1,4 +1,4 @@ -# Copyright 2023 llmware +# Copyright 2023-2024 llmware # Licensed under the Apache License, Version 2.0 (the "License"); you # may not use this file except in compliance with the License. You @@ -11,10 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or # implied. See the License for the specific language governing # permissions and limitations under the License. -"""The resources module implements the text index databases that are used in conjunction with the vector -databases. -Currently, llmware supports MongoDB, Postgres, and SQLite as text index databases. +"""The resources module implements the text index databases that are used as the foundation for creating a +Library in LLMWare, and a wide range of supporting methods, including text query retrieval, library card + management, tracking of embedding progress and status, and the ability to create custom tables. The text index + is used as the 'master' source of aggregating and accessing unstructured information that has been parsed and + organized into Library collections. + +Currently, llmware supports MongoDB, Postgres, and SQLite as text index databases, and supports the use of both +Postgres and SQLite for creation of custom (SQL) tables. + """ import platform @@ -1636,16 +1642,23 @@ def destroy_collection(self, confirm_destroy=False): sql_instruction = f"DROP TABLE {self.library_name};" - results = self.conn.cursor().execute(sql_instruction) - self.conn.commit() - self.conn.close() + # returns TRUE if table does not exist & FALSE if table exists + table_does_not_exist = self.check_if_table_build_required() - return 1 + # if FALSE ... 
drop the table + if not table_does_not_exist: + results = self.conn.cursor().execute(sql_instruction) + self.conn.commit() + self.conn.close() + return 1 + else: + logging.warning(f"update: PGWriter - request to drop table not executed because table " + f"could not be found in the database.") + return -1 logging.warning("update: library not destroyed - need to set confirm_destroy = True") - self.conn.commit() - + # self.conn.commit() self.conn.close() return 0 @@ -2748,10 +2761,20 @@ def destroy_collection(self, confirm_destroy=False): if confirm_destroy: sql_instruction = f"DROP TABLE {self.library_name};" - results = self.conn.cursor().execute(sql_instruction) - self.conn.commit() - self.conn.close() - return 1 + + # returns TRUE if table does not exist & FALSE if table exists + table_does_not_exist = self.check_if_table_build_required() + + # if FALSE ... drop the table + if not table_does_not_exist: + results = self.conn.cursor().execute(sql_instruction) + self.conn.commit() + self.conn.close() + return 1 + else: + logging.warning(f"update: SQLiteWriter - request to drop table not executed because table " + f"could not be found in the database.") + return -1 logging.warning("update: library not destroyed - need to set confirm_destroy = True") self.conn.close() From d49b8468a7a75c5ca5cd73ed1a4ad8c71c8b77c7 Mon Sep 17 00:00:00 2001 From: DARREN OBERST Date: Thu, 30 May 2024 12:58:05 -0400 Subject: [PATCH 43/48] updated docker-compose script for mongo + milvus install --- docker-compose_mongo_milvus.yaml | 83 ++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 docker-compose_mongo_milvus.yaml diff --git a/docker-compose_mongo_milvus.yaml b/docker-compose_mongo_milvus.yaml new file mode 100644 index 00000000..c7356db9 --- /dev/null +++ b/docker-compose_mongo_milvus.yaml @@ -0,0 +1,83 @@ +version: "3.5" + +services: + mongodb: + container_name: mongodb + image: mongo:5.0.10 + # To secure MongoDB, uncomment and set the following 
values + # environment: + # - MONGO_INITDB_DATABASE=admin + # - MONGO_INITDB_ROOT_USERNAME=admin + # - MONGO_INITDB_ROOT_PASSWORD=changeme + volumes: + - llmware-mongodb:/data/db + ports: + - '27017:27017' + + etcd: + container_name: milvus-etcd + image: quay.io/coreos/etcd:v3.5.5 + environment: + - ETCD_AUTO_COMPACTION_MODE=revision + - ETCD_AUTO_COMPACTION_RETENTION=1000 + - ETCD_QUOTA_BACKEND_BYTES=4294967296 + - ETCD_SNAPSHOT_COUNT=50000 + volumes: + - llmware-etcd:/etcd + command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd + healthcheck: + test: ["CMD", "etcdctl", "endpoint", "health"] + interval: 30s + timeout: 20s + retries: 3 + + minio: + container_name: milvus-minio + image: minio/minio:RELEASE.2023-03-20T20-16-18Z + environment: + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + ports: + - "9001:9001" + - "9000:9000" + volumes: + - llmware-minio:/minio_data + command: minio server /minio_data --console-address ":9001" + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + + milvus: + container_name: milvus + image: milvusdb/milvus:v2.3.0 + command: ["milvus", "run", "standalone"] + environment: + ETCD_ENDPOINTS: etcd:2379 + MINIO_ADDRESS: minio:9000 + volumes: + - llmware-milvus:/var/lib/milvus + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"] + interval: 30s + start_period: 90s + timeout: 20s + retries: 3 + ports: + - "19530:19530" + - "9091:9091" + depends_on: + - "etcd" + - "minio" + +volumes: + llmware-mongodb: + driver: local + llmware-etcd: + driver: local + llmware-minio: + driver: local + llmware-milvus: + driver: local + From 84c9faea15685e2dff35953a0f47629f88c2c25d Mon Sep 17 00:00:00 2001 From: Darren Oberst <41238031+doberst@users.noreply.github.com> Date: Thu, 30 May 2024 13:25:40 -0400 Subject: [PATCH 44/48] Update requirements.txt --- llmware/requirements.txt 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmware/requirements.txt b/llmware/requirements.txt index 385341dd..689b4c6b 100644 --- a/llmware/requirements.txt +++ b/llmware/requirements.txt @@ -1,4 +1,4 @@ -boto3==1.24.53 +boto3>=1.24.53 numpy>=1.23.2 openai>=1.0 pymongo>=4.7.0 From 484d9cb8e8b2b540fa7a87e5a254632b76a994c2 Mon Sep 17 00:00:00 2001 From: Darren Oberst <41238031+doberst@users.noreply.github.com> Date: Thu, 30 May 2024 13:27:16 -0400 Subject: [PATCH 45/48] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 739aef9d..4bc26974 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ def glob_fix(package_name, glob): python_requires=">=3.9", zip_safe=True, install_requires=[ - 'boto3==1.24.53', + 'boto3>=1.24.53', 'huggingface-hub>=0.19.4', 'numpy>=1.23.2', 'openai>=1.0.0', From 99d3e1e5a799959511642e84445bfd57a7bdcecf Mon Sep 17 00:00:00 2001 From: DARREN OBERST Date: Thu, 30 May 2024 16:16:43 -0400 Subject: [PATCH 46/48] updates to embedding module --- examples/Embedding/using_milvus_lite.py | 145 ++++ llmware/configs.py | 79 ++- llmware/embeddings.py | 850 ++++++++++++++++-------- 3 files changed, 797 insertions(+), 277 deletions(-) create mode 100644 examples/Embedding/using_milvus_lite.py diff --git a/examples/Embedding/using_milvus_lite.py b/examples/Embedding/using_milvus_lite.py new file mode 100644 index 00000000..b7613630 --- /dev/null +++ b/examples/Embedding/using_milvus_lite.py @@ -0,0 +1,145 @@ + +"""" This example is a fast start with Milvus Lite, which is a 'no-install' file-based version of Milvus, intended +for rapid prototyping. 
A couple of key points to note: + + -- Platform - per Milvus docs, Milvus Lite is designed for Mac and Linux (not on Windows currently) + -- PyMilvus - need to `pip install pymilvus>=2.4.2` + -- within LLMWare: set MilvusConfig("lite", True) +""" + +import os +from llmware.library import Library +from llmware.retrieval import Query +from llmware.setup import Setup +from llmware.status import Status +from llmware.models import ModelCatalog +from llmware.configs import LLMWareConfig, MilvusConfig + +from importlib import util + +if not util.find_spec("pymilvus"): + print("\nto run this example with pymilvus, you need to install pymilvus: pip3 install pymilvus>=2.4.2") + + +def setup_library(library_name): + + """ Note: this setup_library method is provided to enable a self-contained example to create a test library """ + + # Step 1 - Create library which is the main 'organizing construct' in llmware + print ("\nupdate: Creating library: {}".format(library_name)) + + library = Library().create_new_library(library_name) + + # check the embedding status 'before' installing the embedding + embedding_record = library.get_embedding_status() + print("embedding record - before embedding ", embedding_record) + + # Step 2 - Pull down the sample files from S3 through the .load_sample_files() command + # --note: if you need to refresh the sample files, set 'over_write=True' + print ("update: Downloading Sample Files") + + sample_files_path = Setup().load_sample_files(over_write=False) + + # Step 3 - point ".add_files" method to the folder of documents that was just created + # this method parses the documents, text chunks, and captures in database + + print("update: Parsing and Text Indexing Files") + + library.add_files(input_folder_path=os.path.join(sample_files_path, "Agreements"), + chunk_size=400, max_chunk_size=600, smart_chunking=1) + + return library + + +def install_vector_embeddings(library, embedding_model_name): + + """ This method is the core example of installing 
an embedding on a library. + -- two inputs - (1) a pre-created library object and (2) the name of an embedding model """ + + library_name = library.library_name + vector_db = LLMWareConfig().get_vector_db() + + print(f"\nupdate: Starting the Embedding: " + f"library - {library_name} - " + f"vector_db - {vector_db} - " + f"model - {embedding_model_name}") + + # *** this is the one key line of code to create the embedding *** + library.install_new_embedding(embedding_model_name=embedding_model, vector_db=vector_db,batch_size=100) + + # note: for using llmware as part of a larger application, you can check the real-time status by polling Status() + # --both the EmbeddingHandler and Parsers write to Status() at intervals while processing + update = Status().get_embedding_status(library_name, embedding_model) + print("update: Embeddings Complete - Status() check at end of embedding - ", update) + + # Start using the new vector embeddings with Query + sample_query = "incentive compensation" + print("\n\nupdate: Run a sample semantic/vector query: {}".format(sample_query)) + + # queries are constructed by creating a Query object, and passing a library as input + query_results = Query(library).semantic_query(sample_query, result_count=20) + + for i, entries in enumerate(query_results): + + # each query result is a dictionary with many useful keys + + text = entries["text"] + document_source = entries["file_source"] + page_num = entries["page_num"] + vector_distance = entries["distance"] + + # to see all of the dictionary keys returned, uncomment the line below + # print("update: query_results - all - ", i, entries) + + # for display purposes only, we will only show the first 125 characters of the text + if len(text) > 125: text = text[0:125] + " ... 
" + + print("\nupdate: query results - {} - document - {} - page num - {} - distance - {} " + .format( i, document_source, page_num, vector_distance)) + + print("update: text sample - ", text) + + # lets take a look at the library embedding status again at the end to confirm embeddings were created + embedding_record = library.get_embedding_status() + + print("\nupdate: embedding record - ", embedding_record) + + return 0 + + +if __name__ == "__main__": + + # Fast Start configuration - will use no-install embedded sqlite + # -- if you have installed Mongo or Postgres, then change the .set_active_db accordingly + + LLMWareConfig().set_active_db("sqlite") + + # set the "lite" flag in MilvusConfig to True -> to use server version, set to False (which is default) + MilvusConfig().set_config("lite", True) + LLMWareConfig().set_vector_db("milvus") + + # Step 1 - create library + library = setup_library("ex2_milvus_lite") + + # Step 2 - Select any embedding model in the LLMWare catalog + + # to see a list of the embedding models supported, uncomment the line below and print the list + embedding_models = ModelCatalog().list_embedding_models() + + # for i, models in enumerate(embedding_models): + # print("embedding models: ", i, models) + + # for this first embedding, we will use a very popular and fast sentence transformer + embedding_model = "mini-lm-sbert" + + # note: if you want to swap out "mini-lm-sbert" for Open AI 'text-embedding-ada-002', uncomment these lines: + # embedding_model = "text-embedding-ada-002" + # os.environ["USER_MANAGED_OPENAI_API_KEY"] = "" + + # run the core script + install_vector_embeddings(library, embedding_model) + + + + + diff --git a/llmware/configs.py b/llmware/configs.py index 272917dd..8e725a36 100644 --- a/llmware/configs.py +++ b/llmware/configs.py @@ -29,9 +29,38 @@ from colorama import Fore COLOR_WHITE = Fore.WHITE COLOR_RESET = Fore.RESET + COLOR_RED = Fore.RED + COLOR_YELLOW = Fore.YELLOW + COLOR_GREEN= Fore.GREEN + COLOR_BLUE = 
Fore.BLUE except: COLOR_WHITE = "" COLOR_RESET = "" COLOR_RED = "" COLOR_YELLOW = "" COLOR_GREEN = "" COLOR_BLUE = "" + + +class CustomFormatter(logging.Formatter): + + """ CustomFormatter - Configuration of global logging formatting - WIP. """ + + format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s (%(filename)s:%(lineno)d)" + + FORMATS = { + logging.DEBUG: COLOR_GREEN + format + COLOR_RESET, + logging.INFO: COLOR_WHITE + format + COLOR_RESET, + logging.WARNING: COLOR_YELLOW + format + COLOR_RESET, + logging.ERROR: COLOR_RED + format + COLOR_RESET, + logging.CRITICAL: COLOR_RED + format + COLOR_RESET, + 25: COLOR_BLUE + format + COLOR_RESET + } + + def format(self, record): + log_fmt = self.FORMATS.get(record.levelno) + formatter = logging.Formatter(log_fmt) + return formatter.format(record) class LLMWareConfig: @@ -67,7 +96,8 @@ class LLMWareConfig: "llmware_public_models_bucket": "llmware-public-models", "shared_lib_path": os.path.join(os.path.dirname(os.path.realpath(__file__)), "lib"), "logging_level": logging.WARNING, - "logging_format": COLOR_WHITE + '%(levelname)-4s: %(message)s' + COLOR_RESET + "logging_format": COLOR_WHITE + '%(levelname)-4s: %(message)s' + COLOR_RESET, + "logging_level_by_module": {"llmware.embeddings": 20, "llmware.models": 30} } @classmethod @@ -387,6 +417,45 @@ def set_logging_format(cls, formatting_str): cls._conf["logging_format"] = formatting_str return True + @classmethod + def get_logging_level_by_module(cls, module): + if module in cls._conf["logging_level_by_module"]: + return cls._conf["logging_level_by_module"][module] + else: + raise ConfigKeyException(module) + + +class VectorDBRegistry: + + """ Registry of supported Vector DBs, and the class module used to interface with the DB. 
""" + + vector_db_list = {"milvus": {"module": "llmware.embeddings", "class": "EmbeddingMilvus"}, + "chromadb": {"module": "llmware.embeddings", "class": "EmbeddingChromaDB"}, + "qdrant": {"module": "llmware.embeddings", "class": "EmbeddingQdrant"}, + "postgres": {"module": "llmware.embeddings", "class": "EmbeddingPGVector"}, + "pg_vector": {"module": "llmware.embeddings", "class": "EmbeddingPGVector"}, + "redis": {"module": "llmware.embeddings", "class": "EmbeddingRedis"}, + "neo4j": {"module": "llmware.embeddings", "class": "EmbeddingNeo4j"}, + "lancedb": {"module": "llmware.embeddings", "class": "EmbeddingLanceDB"}, + "faiss": {"module": "llmware.embeddings", "class": "EmbeddingFAISS"}, + "pinecone": {"module": "llmware.embeddings", "class": "EmbeddingPinecone"}, + "mongo_atlas": {"module": "llmware.embeddings", "class": "EmbeddingMongoAtlas"} + } + @classmethod + def get_vector_db_list(cls): + """ List current view of implemented supported vector db for embeddings. """ + return cls.vector_db_list + + @classmethod + def add_vector_db(cls, db_name, vector_db_class, module="llmware.embeddings"): + """ Adds a vector db including the module and class. 
""" + new_entry = {db_name: {"module": module, "class": vector_db_class}} + cls.vector_db_list.update(new_entry) + return True + + +logging.basicConfig(format=LLMWareConfig().get_logging_format(), level=LLMWareConfig().get_logging_level()) + class MilvusConfig: @@ -395,7 +464,13 @@ class MilvusConfig: _conf = {"host": os.environ.get("MILVUS_HOST", "localhost"), "port": os.environ.get("MILVUS_PORT", 19530), "db_name": os.environ.get("MILVUS_DB", "default"), - "partitions": []} + "partitions": [], + + # new attributes to support embedded milvus lite + "lite": False, + "lite_folder_path": LLMWareConfig().get_library_path(), + "lite_name": "milvus_lite.db", + } @classmethod def get_config(cls, name): diff --git a/llmware/embeddings.py b/llmware/embeddings.py index b7c48977..4c59e810 100644 --- a/llmware/embeddings.py +++ b/llmware/embeddings.py @@ -1,4 +1,4 @@ -# Copyright 2023 llmware +# Copyright 2023-2024 llmware # Licensed under the Apache License, Version 2.0 (the "License"); you # may not use this file except in compliance with the License. You @@ -19,7 +19,6 @@ _EmbeddingUtils class, which provides a set of functions used by all vector database classes. 
""" - import os import logging import numpy as np @@ -28,80 +27,63 @@ import uuid import itertools from importlib import util - -from pymongo import MongoClient - -try: - from pymilvus import connections, utility, FieldSchema, CollectionSchema, DataType, Collection -except ImportError: - pass - -try: - import faiss -except ImportError: - pass - -# note: update- adding psycopg and postgres to core llmware package in version 0.2.0 -try: - from pgvector.psycopg import register_vector - import psycopg -except ImportError: - pass - -# optional imports of redis - not in project requirements -try: - import redis - from redis.commands.search.field import TagField, TextField, NumericField - from redis.commands.search.indexDefinition import IndexDefinition, IndexType - from redis.commands.search.query import Query - from redis.commands.search.field import VectorField -except ImportError: - pass - -# optional imports of qdrant - not in project requirements -try: - from qdrant_client import QdrantClient - from qdrant_client.http.models import Distance, VectorParams, PointStruct -except ImportError: - pass - -# optional import of pinecone - not in project requirements -try: - from pinecone import Pinecone, ServerlessSpec -except ImportError: - pass - -# optional import of lancedb - not in project requirements -try: - import lancedb -except ImportError: - pass - -# optional import of neo4j - not in project requirements -try: - import neo4j - from neo4j import GraphDatabase -except: - pass - -# optional import of chromadb - not in project requirements -try: - import chromadb -except: - pass - +import importlib from llmware.configs import LLMWareConfig, MongoConfig, MilvusConfig, PostgresConfig, RedisConfig, \ - PineconeConfig, QdrantConfig, Neo4jConfig, LanceDBConfig, ChromaDBConfig + PineconeConfig, QdrantConfig, Neo4jConfig, LanceDBConfig, ChromaDBConfig, VectorDBRegistry from llmware.exceptions import (UnsupportedEmbeddingDatabaseException, EmbeddingModelNotFoundException, - 
DependencyNotInstalledException) + DependencyNotInstalledException, LLMWareException) from llmware.resources import CollectionRetrieval, CollectionWriter from llmware.status import Status from llmware.util import Utilities +""" By default, no vector db drivers are loaded into global program space unless and until they are invoked. Within +each embedding class handler, there is a check if GLOBAL_{VECTOR_DB}_IMPORT is False, and if so, then the module +is loaded, and the GLOBAL_{VECTOR_DB}_IMPORT is set to True. """ + +pymilvus = None +GLOBAL_PYMILVUS_IMPORT = False + +chromadb = None +GLOBAL_CHROMADB_IMPORT = False + +lancedb = None +GLOBAL_LANCEDB_IMPORT = False + +faiss = None +GLOBAL_FAISS_IMPORT = False + +neo4j = None +GLOBAL_NEO4J_IMPORT = False + +qdrant_client = None +GLOBAL_QDRANT_IMPORT = False + +pinecone = None +GLOBAL_PINECONE_IMPORT = False + +redis = None +GLOBAL_REDIS_IMPORT = False + +# pgvector requires import of both pgvector and psycopg +pgvector = None +GLOBAL_PGVECTOR_IMPORT = False + +psycopg = None +GLOBAL_PSYCOPG_IMPORT = False + +# used in mongo-atlas +pymongo = None +GLOBAL_PYMONGO_IMPORT = False + +logger = logging.getLogger(__name__) +log_level = LLMWareConfig().get_logging_level_by_module("llmware.embeddings") +logger.setLevel(level=log_level) + class EmbeddingHandler: - """Provides an interface to all supported vector dabases, which is used by the ``Library`` class. + + """Provides an interface to all supported vector databases, which is used by the ``Library`` class. ``EmbeddingHandler`` is responsible for embedding-related interactions between a library and a vector store. This includes creating, reading, updating, and deleting (CRUD) embeddings. The ``EmbeddingHandler``, @@ -118,9 +100,12 @@ class EmbeddingHandler: embedding_handler : EmbeddingHandler A new ``EmbeddingHandler`` object. 
""" + def __init__(self, library): - self.supported_embedding_dbs = LLMWareConfig().get_supported_vector_db() + # self.supported_embedding_dbs = LLMWareConfig().get_supported_vector_db() + self.supported_embedding_dbs = VectorDBRegistry().get_vector_db_list() + self.library = library def create_new_embedding(self, embedding_db, model, doc_ids=None, batch_size=500): @@ -138,7 +123,7 @@ def create_new_embedding(self, embedding_db, model, doc_ids=None, batch_size=500 embedded_blocks = embedding_status["embedded_blocks"] else: embedded_blocks = -1 - logging.warning("update: embedding_handler - unable to determine if embeddings have " + logger.warning("update: embedding_handler - unable to determine if embeddings have " "been properly counted and captured. Please check if databases connected.") self.library.update_embedding_status("yes", model.model_name, embedding_db, @@ -180,47 +165,22 @@ def _load_embedding_db(self, embedding_db, model=None, model_name=None, embeddin if not embedding_db in self.supported_embedding_dbs: raise UnsupportedEmbeddingDatabaseException(embedding_db) - - if embedding_db == "milvus": - return EmbeddingMilvus(self.library, model=model, model_name=model_name, - embedding_dims=embedding_dims) - - if embedding_db == "faiss": - return EmbeddingFAISS(self.library, model=model, model_name=model_name, - embedding_dims=embedding_dims) - - if embedding_db == "pinecone": - return EmbeddingPinecone(self.library, model=model, model_name=model_name, - embedding_dims=embedding_dims) - - if embedding_db == "mongo_atlas": - return EmbeddingMongoAtlas(self.library, model=model,model_name=model_name, - embedding_dims=embedding_dims) - - if embedding_db == "redis": - return EmbeddingRedis(self.library, model=model, model_name=model_name, - embedding_dims=embedding_dims) - - if embedding_db == "qdrant": - return EmbeddingQdrant(self.library, model=model, model_name=model_name, - embedding_dims=embedding_dims) - - if embedding_db == "lancedb": - return 
EmbeddingLanceDB(self.library, model=model, model_name=model_name, - embedding_dims=embedding_dims) - - # note: pg_vector == postgres (two aliases provided) - if embedding_db in ["pg_vector", "postgres"]: - return EmbeddingPGVector(self.library,model=model, model_name=model_name, - embedding_dims=embedding_dims) - - if embedding_db == "neo4j": - return EmbeddingNeo4j(self.library, model=model, model_name=model_name, - embedding_dims=embedding_dims) - - if embedding_db == "chromadb": - return EmbeddingChromaDB(self.library, model=model, model_name=model_name, - embedding_dims=embedding_dims) + + vdb = self.supported_embedding_dbs[embedding_db] + + # dynamically load the module/class for the specific embedding handler + vdb_module = vdb["module"] + vdb_class = vdb["class"] + vdb_module = importlib.import_module(vdb_module) + + if hasattr(vdb_module, vdb_class): + model_class = getattr(vdb_module, vdb_class) + + return model_class(self.library, model=model, model_name=model_name,embedding_dims=embedding_dims) + + else: + raise LLMWareException(message=f"Exception: could not find class implementation for {embedding_db}, which " + f"is expected at: {vdb_module} - {vdb_class}.") def generate_index_name(self, account_name, library_name, model_name, max_component_length=19): @@ -242,6 +202,7 @@ def generate_index_name(self, account_name, library_name, model_name, max_compon class _EmbeddingUtils: + """Provides functions to vector stores, such as creating names for the text collection database as well as creating names for vector such, and creating a summary of an embedding process. @@ -271,6 +232,7 @@ class _EmbeddingUtils: embedding_utils : _EmbeddingUtils A new ``_EmbeddingUtils`` object. 
""" + def __init__(self, library_name=None, model_name=None, account_name=None,db_name=None, embedding_dims=None): @@ -343,8 +305,6 @@ def generate_embedding_summary(self, embeddings_created): "embedding_dims": self.embedding_dims, "time_stamp": Utilities().get_current_time_now()} - # print("update: embedding_summary - ", embedding_summary) - return embedding_summary def update_text_index(self, block_ids, current_index): @@ -391,9 +351,8 @@ def unset_text_index(self): class EmbeddingMilvus: - """Implements the vector database Milvius. - - ``EmbeddingMivlus`` implements the interface to the ``Milvus`` vector store. It is used by the + """ + ``EmbeddingMilvus`` implements the interface to the ``Milvus`` vector store. It is used by the ``EmbeddingHandler``. Parameters @@ -423,15 +382,25 @@ def __init__(self, library, model=None, model_name=None, embedding_dims=None): self.account_name = library.account_name self.milvus_alias = "default" - # Connect to milvus - # Instantiate client. - if not util.find_spec("pymilvus"): - raise DependencyNotInstalledException("pip3 install pymilvus") + self.use_milvus_lite = MilvusConfig().get_config("lite") + + # confirm that pymilvus installed + + global GLOBAL_PYMILVUS_IMPORT + if not GLOBAL_PYMILVUS_IMPORT: + if util.find_spec("pymilvus"): + + try: + global pymilvus + pymilvus = importlib.import_module("pymilvus") + GLOBAL_PYMILVUS_IMPORT = True + except: + raise LLMWareException(message="Exception: could not load pymilvus module.") + + else: + raise LLMWareException(message="Exception: need to import pymilvus to use this class.") - connections.connect(self.milvus_alias, - host=MilvusConfig.get_config("host"), - port=MilvusConfig.get_config("port"), - db_name=MilvusConfig.get_config("db_name")) + # end dynamic import here # look up model card if not model and not model_name: @@ -455,23 +424,74 @@ def __init__(self, library, model=None, model_name=None, embedding_dims=None): self.collection_name = 
self.utils.create_safe_collection_name() self.collection_key = self.utils.create_db_specific_key() - # if collection does not exist, create it - if not utility.has_collection(self.collection_name): - fields = [ - FieldSchema(name="block_mongo_id", dtype=DataType.VARCHAR, is_primary=True, max_length=30,auto_id=False), - FieldSchema(name="block_doc_id", dtype=DataType.INT64), - FieldSchema(name="embedding_vector", dtype=DataType.FLOAT_VECTOR, dim=self.embedding_dims) - ] + if self.use_milvus_lite: - collection = Collection(self.collection_name, CollectionSchema(fields)) - index_params = { - "metric_type": "L2", - "index_type": "IVF_FLAT", - "params": {"nlist": 1024} - } - collection.create_index("embedding_vector", index_params) + logger.info(f"update: EmbeddingHandler - Milvus - selecting 'lite' version. If you intend to use " + f"a server-based version of Milvus, please set: MilvusConfig().set_config('lite', False).") + + lite_path = MilvusConfig().get_config("lite_folder_path") + lite_db_name = MilvusConfig().get_config("lite_name") + + self.collection = pymilvus.MilvusClient(os.path.join(lite_path, lite_db_name)) + + # check if collection_name found in list of collections - load, if exists, else create new + if self.collection_name in self.collection.list_collections(): + self.collection.load_collection(self.collection_name) + else: + schema = self.collection.create_schema( + auto_id=False, + enable_dynamic_field=True, + ) - self.collection = Collection(self.collection_name) + # add fields to schema + schema.add_field(field_name="block_mongo_id", datatype=pymilvus.DataType.VARCHAR, is_primary=True, + max_length=30, auto_id=False) + schema.add_field(field_name="block_doc_id", datatype=pymilvus.DataType.INT64) + schema.add_field(field_name="embedding_vector", datatype=pymilvus.DataType.FLOAT_VECTOR, dim=self.embedding_dims) + + index_params = self.collection.prepare_index_params() + + # add index + index_params.add_index( + field_name="embedding_vector", + 
metric_type="L2", + ) + + self.collection.create_collection(collection_name=self.collection_name, + dimension=self.embedding_dims, + schema=schema, + index_params=index_params) + + else: + + # connect to Milvus server + + logger.info(f"update: EmbeddingHandler - Milvus - connecting to Milvus server instance. To use " + f"Milvus 'lite', set MilvusConfig().set_config('lite', True).") + + pymilvus.connections.connect(self.milvus_alias, + host=MilvusConfig.get_config("host"), + port=MilvusConfig.get_config("port"), + db_name=MilvusConfig.get_config("db_name")) + + if not pymilvus.utility.has_collection(self.collection_name): + fields = [ + pymilvus.FieldSchema(name="block_mongo_id", + dtype=pymilvus.DataType.VARCHAR, is_primary=True, max_length=30,auto_id=False), + pymilvus.FieldSchema(name="block_doc_id", dtype=pymilvus.DataType.INT64), + pymilvus.FieldSchema(name="embedding_vector", dtype=pymilvus.DataType.FLOAT_VECTOR, + dim=self.embedding_dims) + ] + + collection = pymilvus.Collection(self.collection_name, pymilvus.CollectionSchema(fields)) + index_params = { + "metric_type": "L2", + "index_type": "IVF_FLAT", + "params": {"nlist": 1024} + } + collection.create_index("embedding_vector", index_params) + + self.collection = pymilvus.Collection(self.collection_name) def create_new_embedding(self, doc_ids = None, batch_size=500): @@ -487,8 +507,6 @@ def create_new_embedding(self, doc_ids = None, batch_size=500): current_index = 0 finished = False - # all_blocks_iter = iter(all_blocks_cursor) - while not finished: block_ids, doc_ids, sentences = [], [], [] @@ -510,12 +528,24 @@ def create_new_embedding(self, doc_ids = None, batch_size=500): block_ids.append(str(block["_id"])) doc_ids.append(int(block["doc_ID"])) sentences.append(text_search) - + if len(sentences) > 0: + # Process the batch vectors = self.model.embedding(sentences) data = [block_ids, doc_ids, vectors] - self.collection.insert(data) + + if self.use_milvus_lite: + + d=[] + for i, vec in enumerate(vectors): 
+ new_row = {"block_mongo_id": block_ids[i], "block_doc_id": doc_ids[i], "embedding_vector": vec} + d.append(new_row) + + self.collection.insert(data=d, collection_name=self.collection_name) + + else: + self.collection.insert(data) current_index = self.utils.update_text_index(block_ids,current_index) @@ -524,44 +554,80 @@ def create_new_embedding(self, doc_ids = None, batch_size=500): status.increment_embedding_status(self.library_name, self.model_name, len(sentences)) # will add configuration options to show/display - print (f"update: embedding_handler - Milvus - Embeddings Created: {embeddings_created} of {num_of_blocks}") - - self.collection.flush() + logger.info(f"update: embedding_handler - Milvus - Embeddings Created: {embeddings_created} of {num_of_blocks}") + + if not self.use_milvus_lite: + self.collection.flush() embedding_summary = self.utils.generate_embedding_summary(embeddings_created) - logging.info("update: EmbeddingHandler - Milvus - embedding_summary - %s", embedding_summary) + logger.info(f"update: EmbeddingHandler - Milvus - embedding_summary - {embedding_summary}") return embedding_summary def search_index(self, query_embedding_vector, sample_count=10): - self.collection.load() + if not self.use_milvus_lite: + self.collection.load() - search_params = { - "metric_type": "L2", - "params": {"nprobe": 10} - } + search_params = { + "field_name": "embedding_vector", + "metric_type": "L2", + "params": {"nprobe": 10} + } - # TODO: add optional / configurable partitions + # TODO: add optional / configurable partitions - result = self.collection.search( - data=[query_embedding_vector], - anns_field="embedding_vector", - param=search_params, - limit=sample_count, - output_fields=["block_mongo_id"] - ) + result = self.collection.search( + data=[query_embedding_vector], + anns_field="embedding_vector", + param=search_params, + limit=sample_count, + output_fields=["block_mongo_id"] + ) + + else: + + search_params = { + "field_name": "embedding_vector", + 
"metric_type": "L2", + # "params": {"nprobe": 10} + } + + result = self.collection.search(collection_name=self.collection_name, + data=[query_embedding_vector], + anns_field="embedding_vector", + search_params=search_params, + limit=sample_count, + output_fields=["block_mongo_id"] + ) block_list = [] for hits in result: for hit in hits: - _id = hit.entity.get('block_mongo_id') + + if self.use_milvus_lite: + + try: + # _id = int(hit["entity"]["block_mongo_id"]) + _id = hit["entity"]["block_mongo_id"] + except: + logger.warning(f"update: EmbeddingHandler - Milvus - search - unexpected - " + f"could not convert to number - {hit}") + _id = -1 + else: + _id = hit.entity.get('block_mongo_id') block_result_list = self.utils.lookup_text_index(_id) for block in block_result_list: - block_list.append((block, hit.distance)) + + if self.use_milvus_lite: + distance = hit["distance"] + else: + distance = hit.distance + + block_list.append((block, distance)) """ try: @@ -576,10 +642,16 @@ def search_index(self, query_embedding_vector, sample_count=10): def delete_index(self): - collection = Collection(self.collection_name) - collection.release() - utility.drop_collection(self.collection_name) - connections.disconnect(self.milvus_alias) + if not self.use_milvus_lite: + + collection = pymilvus.Collection(self.collection_name) + collection.release() + pymilvus.utility.drop_collection(self.collection_name) + pymilvus.connections.disconnect(self.milvus_alias) + + else: + # delete + res = self.collection.delete(collection_name=self.collection_name) # Synchronize and remove embedding flag from collection db self.utils.unset_text_index() @@ -588,6 +660,7 @@ def delete_index(self): class EmbeddingFAISS: + """Implements the vector database FAISS. ``EmbeddingFAISS`` implements the interface to the ``FAISS`` vector database. It is used by the @@ -612,8 +685,25 @@ class EmbeddingFAISS: embedding_faiss : EmbeddingFAISS A new ``EmbeddingFAISS`` object. 
""" + def __init__(self, library, model=None, model_name=None, embedding_dims=None): + global GLOBAL_FAISS_IMPORT + if not GLOBAL_FAISS_IMPORT: + if util.find_spec("faiss"): + + try: + global faiss + faiss = importlib.import_module("faiss") + GLOBAL_FAISS_IMPORT = True + except: + raise LLMWareException(message="Exception: could not load faiss module.") + + else: + raise LLMWareException(message="Exception: need to import faiss to use this class.") + + # end dynamic import here + self.library = library self.library_name = library.library_name self.account_name = library.account_name @@ -715,7 +805,7 @@ def create_new_embedding(self, doc_ids=None, batch_size=100): status.increment_embedding_status(self.library.library_name, self.model_name, len(sentences)) # will add options to display/hide - print (f"update: embedding_handler - FAISS - Embeddings Created: {embeddings_created} of {num_of_blocks}") + logger.info(f"update: embedding_handler - FAISS - Embeddings Created: {embeddings_created} of {num_of_blocks}") # Ensure any existing file is removed before saving if os.path.exists(self.embedding_file_path): @@ -725,7 +815,7 @@ def create_new_embedding(self, doc_ids=None, batch_size=100): embedding_summary = self.utils.generate_embedding_summary(embeddings_created) - logging.info("update: EmbeddingHandler - FAISS - embedding_summary - %s", embedding_summary) + logger.info(f"update: EmbeddingHandler - FAISS - embedding_summary - {embedding_summary}") return embedding_summary @@ -768,9 +858,10 @@ def delete_index(self): return 1 class EmbeddingLanceDB: + """Implements the vector database LanceDB. - ``EmbeddingLancDB`` implements the interface to the ``LanceDB`` vector database. It is used by the + ``EmbeddingLanceDB`` implements the interface to the ``LanceDB`` vector database. It is used by the ``EmbeddingHandler``. Parameters @@ -792,56 +883,77 @@ class EmbeddingLanceDB: embedding_lancedb : EmbeddingLanceDB A new ``EmbeddingLanceDB`` object. 
""" + def __init__(self, library, model=None, model_name=None, embedding_dims=None): - self.uri = LanceDBConfig().get_config("uri") - self.library = library - self.library_name = self.library.library_name - self.account_name = self.library.account_name - # look up model card - if not model and not model_name: - raise EmbeddingModelNotFoundException("no-model-or-model-name-provided") + # confirm that lancedb installed - self.model = model - self.model_name = model_name - self.embedding_dims = embedding_dims + global GLOBAL_LANCEDB_IMPORT + if not GLOBAL_LANCEDB_IMPORT: + if util.find_spec("lancedb"): - # if model passed (not None), then use model name - if self.model: - self.model_name = self.model.model_name - self.embedding_dims = model.embedding_dims + try: + global lancedb + lancedb = importlib.import_module("lancedb") + GLOBAL_LANCEDB_IMPORT = True + except: + raise LLMWareException(message="Exception: could not load lancedb module.") - # initialize LanceDB - self.index = None + else: + raise LLMWareException(message="Exception: need to import lancedb to use this class.") - # initiate connection to LanceDB locally - try: - self.db = lancedb.connect(self.uri) - except: - raise ImportError( - "Exception - could not connect to LanceDB - please check:" - "1. LanceDB python package is installed, e.g,. 'pip install lancedb', and" - "2. 
The uri is properly set.") - self.utils = _EmbeddingUtils(library_name=self.library_name, - model_name=self.model_name, - account_name=self.account_name, - db_name="lancedb", - embedding_dims=self.embedding_dims) + # end dynamic import here - self.collection_name = self.utils.create_safe_collection_name() - self.collection_key = self.utils.create_db_specific_key() + self.uri = LanceDBConfig().get_config("uri") + self.library = library + self.library_name = self.library.library_name + self.account_name = self.library.account_name - # build new name here - # self.index_name = self.collection_name + # look up model card + if not model and not model_name: + raise EmbeddingModelNotFoundException("no-model-or-model-name-provided") - if self.collection_name not in self.db.table_names(): - self.index = self._init_table(self.collection_name) - # you don't need to create an index with lanceDB upto million vectors is efficiently supported with peak performance, - # Creating an index will fasten the search process and it needs to be done once table has some vectors already. + self.model = model + self.model_name = model_name + self.embedding_dims = embedding_dims + + # if model passed (not None), then use model name + if self.model: + self.model_name = self.model.model_name + self.embedding_dims = model.embedding_dims + + # initialize LanceDB + self.index = None + + # initiate connection to LanceDB locally + try: + self.db = lancedb.connect(self.uri) + except: + raise ImportError( + "Exception - could not connect to LanceDB - please check:" + "1. LanceDB python package is installed, e.g,. 'pip install lancedb', and" + "2. 
The uri is properly set.") + + self.utils = _EmbeddingUtils(library_name=self.library_name, + model_name=self.model_name, + account_name=self.account_name, + db_name="lancedb", + embedding_dims=self.embedding_dims) + + self.collection_name = self.utils.create_safe_collection_name() + self.collection_key = self.utils.create_db_specific_key() + + # build new name here + # self.index_name = self.collection_name + + if self.collection_name not in self.db.table_names(): + self.index = self._init_table(self.collection_name) + # you don't need to create an index with lanceDB upto million vectors is efficiently supported with peak performance, + # Creating an index will fasten the search process and it needs to be done once table has some vectors already. + + # connect to table + self.index = self.db.open_table(self.collection_name) - # connect to table - self.index = self.db.open_table(self.collection_name) - def _init_table(self,table_name): try: @@ -856,7 +968,6 @@ def _init_table(self,table_name): tbl = self.db.create_table(table_name, schema=schema, mode="overwrite") return tbl - def create_new_embedding(self, doc_ids = None, batch_size=500): all_blocks_cursor, num_of_blocks = self.utils.get_blocks_cursor(doc_ids=doc_ids) @@ -901,9 +1012,8 @@ def create_new_embedding(self, doc_ids = None, batch_size=500): vectors_ingest = [{ 'id' : block_id,'vector': vector.tolist()} for block_id,vector in zip(block_ids,vectors)] self.index.add(vectors_ingest) except Exception as e : - print(self.index) - print('schema',self.index.schema) - raise e + raise LLMWareException(message=f"Exception: LanceDB - {e} - {self.index} - schema - " + f"{self.index.schema}") current_index = self.utils.update_text_index(block_ids,current_index) @@ -911,11 +1021,12 @@ def create_new_embedding(self, doc_ids = None, batch_size=500): status.increment_embedding_status(self.library.library_name, self.model_name, len(sentences)) # will add options to configure to show/hide - print (f"update: 
embedding_handler - Lancedb - Embeddings Created: {embeddings_created} of {num_of_blocks}") + logger.info (f"update: embedding_handler - Lancedb - Embeddings Created: " + f"{embeddings_created} of {num_of_blocks}") embedding_summary = self.utils.generate_embedding_summary(embeddings_created) - logging.info("update: EmbeddingHandler - Lancedb - embedding_summary - %s", embedding_summary) + logger.info(f"update: EmbeddingHandler - Lancedb - embedding_summary - {embedding_summary}") return embedding_summary @@ -942,8 +1053,7 @@ def search_index(self, query_embedding_vector, sample_count=10): # block_list.append((block, match._distance)) except Exception as e: - print("result df cols" ,result.columns, type(result)) - raise e + raise LLMWareException(message=f"Exception: LanceDB - {e}") return block_list @@ -958,6 +1068,7 @@ def delete_index(self): class EmbeddingPinecone: + """Implements the vector database Pinecone. ``EmbeddingPinecone`` implements the interface to the ``Pinecone`` vector database. It is used by the @@ -982,6 +1093,7 @@ class EmbeddingPinecone: embedding_pinecone : EmbeddingPinecone A new ``EmbeddingPinecone`` object. 
""" + def __init__(self, library, model=None, model_name=None, embedding_dims=None): self.api_key = PineconeConfig().get_config("pinecone_api_key") @@ -1008,9 +1120,30 @@ def __init__(self, library, model=None, model_name=None, embedding_dims=None): # initialize pinecone self.index = None + global GLOBAL_PINECONE_IMPORT + if not GLOBAL_PINECONE_IMPORT: + if util.find_spec("pinecone"): + + try: + global pinecone + pinecone = importlib.import_module("pinecone") + GLOBAL_PINECONE_IMPORT = True + except: + raise LLMWareException(message="Exception: could not load pinecone module.") + + else: + raise LLMWareException(message="Exception: need to import pinecone to use this class.") + + """ + try: + from pinecone import Pinecone, ServerlessSpec + except ImportError: + pass + """ + # initiate connection to Pinecone try: - pinecone = Pinecone(api_key=self.api_key) + pinecone_client = pinecone.Pinecone(api_key=self.api_key) except: raise ImportError( "Exception - could not connect to Pinecone - please check:" @@ -1033,23 +1166,24 @@ def __init__(self, library, model=None, model_name=None, embedding_dims=None): # build new name here # self.index_name = self.collection_name - pinecone_indexes = [pincone_index['name'] for pincone_index in pinecone.list_indexes()] + pinecone_indexes = [pincone_index['name'] for pincone_index in pinecone_client.list_indexes()] if self.collection_name not in pinecone_indexes: - pinecone.create_index( + pinecone_client.create_index( name=self.collection_name, dimension=self.embedding_dims, metric="euclidean", - spec=ServerlessSpec( + spec=pinecone.ServerlessSpec( cloud=self.cloud, region=self.region)) - pinecone.describe_index(self.collection_name) # Waits for index to be created + pinecone_client.describe_index(self.collection_name) # Waits for index to be created # describe_index_stats() # Returns: {'dimension': 8, 'index_fullness': 0.0, 'namespaces': {'': {'vector_count': 5}}} # connect to index - self.index = 
pinecone.Index(self.collection_name) + self.index = pinecone_client.Index(self.collection_name) def create_new_embedding(self, doc_ids = None, batch_size=100): + def chunks(iterable, batch_size=100): """A helper function to break an iterable into chunks of size batch_size.""" it = iter(iterable) @@ -1058,7 +1192,6 @@ def chunks(iterable, batch_size=100): yield chunk chunk = tuple(itertools.islice(it, batch_size)) - all_blocks_cursor, num_of_blocks = self.utils.get_blocks_cursor(doc_ids=doc_ids) # Initialize a new status @@ -1111,11 +1244,12 @@ def chunks(iterable, batch_size=100): status.increment_embedding_status(self.library.library_name, self.model_name, len(sentences)) # will add options to configure to show/hide - print (f"update: embedding_handler - Pinecone - Embeddings Created: {embeddings_created} of {num_of_blocks}") + logger.info (f"update: embedding_handler - Pinecone - Embeddings Created: " + f"{embeddings_created} of {num_of_blocks}") embedding_summary = self.utils.generate_embedding_summary(embeddings_created) - logging.info("update: EmbeddingHandler - Pinecone - embedding_summary - %s", embedding_summary) + logger.info(f"update: EmbeddingHandler - Pinecone - embedding_summary - {embedding_summary}") return embedding_summary @@ -1169,6 +1303,7 @@ class EmbeddingMongoAtlas: embedding_mongoatlas : EmbeddingMongoAtlas A new ``EmbeddingMongoAtlas`` object. """ + def __init__(self, library, model=None, model_name=None, embedding_dims=None): # Use a specified Mongo Atlas connection string if supplied. 
@@ -1208,7 +1343,26 @@ def __init__(self, library, model=None, model_name=None, embedding_dims=None):
         # self.index_name = self.collection_name
 
         # Connect and create a MongoClient
-        self.mongo_client = MongoClient(self.connection_uri)
+        # confirm that pymongo installed
+
+        global GLOBAL_PYMONGO_IMPORT
+        if not GLOBAL_PYMONGO_IMPORT:
+            if util.find_spec("pymongo"):
+
+                try:
+                    global pymongo
+                    pymongo = importlib.import_module("pymongo")
+                    GLOBAL_PYMONGO_IMPORT = True
+                except:
+                    raise LLMWareException(message="Exception: could not load pymongo module.")
+
+            else:
+                raise LLMWareException(message="Exception: need to import pymongo to use this class.")
+
+        # end dynamic import here
+        # from pymongo import MongoClient
+
+        self.mongo_client = pymongo.MongoClient(self.connection_uri)
 
         # Make sure the Database exists by creating a dummy metadata collection
         self.embedding_db_name = "llmware_embeddings"
@@ -1302,28 +1456,33 @@ def create_new_embedding(self, doc_ids = None, batch_size=500):
                 status.increment_embedding_status(self.library.library_name, self.model_name, len(sentences))
 
                 # will add configuration options to hide/show
-                print (f"update: embedding_handler - Mongo Atlas - Embeddings Created: {embeddings_created} of {num_of_blocks}")
+                logger.info(f"update: embedding_handler - Mongo Atlas - Embeddings Created: "
+                            f"{embeddings_created} of {num_of_blocks}")
 
                 last_block_id = block_ids[-1]
 
         if embeddings_created > 0:
-            print(f"Embedding(Mongo Atlas): Waiting for {self.embedding_db_name}.{self.collection_name} to be ready for vector search...")
+            logger.info(f"Embedding(Mongo Atlas): Waiting for {self.embedding_db_name}.{self.collection_name} "
+                        f"to be ready for vector search...")
             start_time = time.time()
             self.wait_for_search_index(last_block_id, start_time)
             wait_time = time.time() - start_time
-            print(f"Embedding(Mongo Atlas): {self.embedding_db_name}.{self.collection_name} ready ({wait_time: .2f} seconds)")
+            logger.info(f"Embedding(Mongo Atlas): 
{self.embedding_db_name}.{self.collection_name} " + f"ready ({wait_time: .2f} seconds)") embedding_summary = self.utils.generate_embedding_summary(embeddings_created) - logging.info("update: EmbeddingHandler - Mongo Atlas - embedding_summary - %s", embedding_summary) + logger.info(f"update: EmbeddingHandler - Mongo Atlas - embedding_summary - {embedding_summary}") return embedding_summary - # After doc insertion we want to make sure the index is ready before proceeding def wait_for_search_index(self, last_block_id, start_time): + + """ After doc insertion, we want to make sure the index is ready before proceeding ... """ + # If we've been waiting for 5 mins, then time out and just return if time.time() > start_time + (5 * 60): return @@ -1396,6 +1555,7 @@ def delete_index(self, index_name): class EmbeddingRedis: + """Implements the use of Redis as a vector database. ``EmbeddingRedis`` implements the interface to ``Redis``. It is used by the @@ -1420,6 +1580,7 @@ class EmbeddingRedis: embedding_redis : EmbeddingRedis A new ``EmbeddingRedis`` object. 
""" + def __init__(self, library, model=None, model_name=None, embedding_dims=None): self.library = library @@ -1430,6 +1591,34 @@ def __init__(self, library, model=None, model_name=None, embedding_dims=None): redis_host = RedisConfig().get_config("host") redis_port = RedisConfig().get_config("port") + """ + try: + # import redis + from redis.commands.search.field import TagField, TextField, NumericField, VectorField + from redis.commands.search.indexDefinition import IndexDefinition, IndexType + from redis.commands.search.query import Query + except ImportError: + pass + """ + + # confirm that redis installed + + global GLOBAL_REDIS_IMPORT + if not GLOBAL_REDIS_IMPORT: + if util.find_spec("redis"): + + try: + global redis + redis= importlib.import_module("redis") + GLOBAL_REDIS_IMPORT = True + except: + raise LLMWareException(message="Exception: could not load redis module.") + + else: + raise LLMWareException(message="Exception: need to import redis to use this class.") + + # end dynamic import here + self.r = redis.Redis(host=redis_host, port=redis_port, decode_responses=True) # look up model card @@ -1455,18 +1644,19 @@ def __init__(self, library, model=None, model_name=None, embedding_dims=None): try: # check to see if index exists self.r.ft(self.collection_name).info() - logging.info("update: embedding_handler - Redis - index already exists - %s", self.collection_name) + logger.info("update: embedding_handler - Redis - index already exists - %s", self.collection_name) except: + from redis.commands.search import field # schema schema = ( - NumericField("id"), - TextField("text"), - TextField("block_mongo_id"), - NumericField("block_id"), - NumericField("block_doc_id"), - VectorField("vector", # Vector Field Name + field.NumericField("id"), + field.TextField("text"), + field.TextField("block_mongo_id"), + field.NumericField("block_id"), + field.NumericField("block_doc_id"), + field.VectorField("vector", # Vector Field Name "FLAT", { # Vector Index Type: FLAT or 
HNSW "TYPE": "FLOAT32", # FLOAT32 or FLOAT64 "DIM": self.embedding_dims, @@ -1476,12 +1666,14 @@ def __init__(self, library, model=None, model_name=None, embedding_dims=None): ) # index Definition - definition = IndexDefinition(prefix=[self.DOC_PREFIX], index_type=IndexType.HASH) + from redis.commands.search.indexDefinition import IndexDefinition, IndexType + definition = IndexDefinition(prefix=[self.DOC_PREFIX], + index_type=IndexType.HASH) # create Index self.r.ft(self.collection_name).create_index(fields=schema, definition=definition) - logging.info("update: embedding_handler - Redis - creating new index - %s ", self.collection_name) + logger.info("update: embedding_handler - Redis - creating new index - %s ", self.collection_name) def create_new_embedding(self, doc_ids=None, batch_size=500): @@ -1559,11 +1751,12 @@ def create_new_embedding(self, doc_ids=None, batch_size=500): status.increment_embedding_status(self.library.library_name, self.model_name, len(sentences)) # will add configuration options to show/display - print(f"update: embedding_handler - Redis - Embeddings Created: {embeddings_created} of {num_of_blocks}") + logger.info(f"update: embedding_handler - Redis - Embeddings Created: " + f"{embeddings_created} of {num_of_blocks}") embedding_summary = self.utils.generate_embedding_summary(embeddings_created) - logging.info("update: EmbeddingHandler - Redis - embedding_summary - %s", embedding_summary) + logger.info(f"update: EmbeddingHandler - Redis - embedding_summary - {embedding_summary}") return embedding_summary @@ -1572,7 +1765,7 @@ def search_index(self, query_embedding_vector, sample_count=10): query_embedding_vector = np.array(query_embedding_vector) query = ( - Query(f"*=>[KNN {sample_count} @vector $vec as score]") + redis.commands.search.query.Query(f"*=>[KNN {sample_count} @vector $vec as score]") .sort_by("score") .return_fields("score", "block_mongo_id", "block_doc_id", "block_id","text") .paging(0, sample_count) @@ -1588,8 +1781,6 @@ def 
search_index(self, query_embedding_vector, sample_count=10): block_list = [] for j, res in enumerate(results): - # print("results: ", j, res) - _id = str(res["block_mongo_id"]) score = float(res["score"]) @@ -1612,6 +1803,7 @@ def delete_index(self): class EmbeddingQdrant: + """Implements the Qdrant vector database. ``EmbeddingQdrant`` implements the interface to ``Qdrant``. It is used by the @@ -1636,13 +1828,40 @@ class EmbeddingQdrant: embedding_qdrant : EmbeddingQdrant A new ``EmbeddingQdrant`` object. """ + def __init__(self, library, model=None, model_name=None, embedding_dims=None): self.library = library self.library_name = library.library_name self.account_name = library.account_name - self.qclient = QdrantClient(**QdrantConfig.get_config()) + # confirm that qdrant installed + + global GLOBAL_QDRANT_IMPORT + if not GLOBAL_QDRANT_IMPORT: + if util.find_spec("qdrant_client"): + + try: + global qdrant_client + qdrant_client = importlib.import_module("qdrant_client") + GLOBAL_QDRANT_IMPORT = True + except: + raise LLMWareException(message="Exception: could not load qdrant_client module.") + + else: + raise LLMWareException(message="Exception: need to import qdrant_client to use this class.") + + # end dynamic import here + + """ + try: + from qdrant_client import QdrantClient + from qdrant_client.http.models import Distance, VectorParams, PointStruct + except ImportError: + pass + """ + + self.qclient = qdrant_client.QdrantClient(**QdrantConfig.get_config()) # look up model card self.model = model @@ -1677,9 +1896,10 @@ def __init__(self, library, model=None, model_name=None, embedding_dims=None): self.collection = ( self.qclient.create_collection( collection_name=self.collection_name, - vectors_config=VectorParams(size=self.embedding_dims, distance=Distance.DOT), )) + vectors_config=qdrant_client.http.models.VectorParams(size=self.embedding_dims, + distance=qdrant_client.http.models.Distance.DOT), )) - logging.info("update: embedding_handler - QDRANT - 
creating new collection - %s", + logger.info("update: embedding_handler - QDRANT - creating new collection - %s", self.collection_name) else: @@ -1731,15 +1951,15 @@ def create_new_embedding(self, doc_ids=None, batch_size=500): for i, embedding in enumerate(vectors): point_id = str(uuid.uuid4()) - ps = PointStruct(id=point_id, vector=embedding, - payload={"block_doc_id": doc_ids[i], "sentences": sentences[i], - "block_mongo_id": block_ids[i]}) + ps = qdrant_client.http.models.PointStruct(id=point_id, vector=embedding, + payload={"block_doc_id": doc_ids[i], + "sentences": sentences[i], + "block_mongo_id": block_ids[i]}) points_batch.append(ps) # upsert a batch of points - self.qclient.upsert(collection_name=self.collection_name, wait=True, - points=points_batch) + self.qclient.upsert(collection_name=self.collection_name, wait=True, points=points_batch) points_batch = [] @@ -1752,12 +1972,12 @@ def create_new_embedding(self, doc_ids=None, batch_size=500): status.increment_embedding_status(self.library.library_name, self.model_name, len(sentences)) # will add configuration options to show/display - print( - f"update: embedding_handler - Qdrant - Embeddings Created: {embeddings_created} of {num_of_blocks}") + logger.info(f"update: embedding_handler - Qdrant - Embeddings Created: " + f"{embeddings_created} of {num_of_blocks}") embedding_summary = self.utils.generate_embedding_summary(embeddings_created) - logging.info("update: EmbeddingHandler - Qdrant - embedding_summary - %s", embedding_summary) + logger.info(f"update: EmbeddingHandler - Qdrant - embedding_summary - {embedding_summary}") return embedding_summary @@ -1769,8 +1989,6 @@ def search_index(self, query_embedding_vector, sample_count=10): block_list = [] for j, res in enumerate(search_results): - # print("results: ", j, res) - _id = res.payload["block_mongo_id"] block_result_list = self.utils.lookup_text_index(_id) @@ -1792,6 +2010,7 @@ def delete_index(self): class EmbeddingPGVector: + """Implements the 
interface to the PGVector vector database. ``EmbeddingPGVector`` implements the interface to ``PGVector``. It is used by the @@ -1816,6 +2035,7 @@ class EmbeddingPGVector: embedding_pgvector : EmbeddingPGVector A new ``EmbeddingPGVector`` object. """ + def __init__(self, library, model=None, model_name=None, embedding_dims=None, full_schema=False): self.library = library @@ -1858,7 +2078,43 @@ def __init__(self, library, model=None, model_name=None, embedding_dims=None, fu # --note: in future releases, we will be building out more support for PostGres # self.full_schema = full_schema - # Session connection + # fist check for core postgres driver, and load if not present + global GLOBAL_PSYCOPG_IMPORT + if not GLOBAL_PSYCOPG_IMPORT: + if util.find_spec("psycopg"): + + try: + global psycopg + psycopg = importlib.import_module("psycopg") + GLOBAL_PSYCOPG_IMPORT = True + except: + raise LLMWareException(message="Exception: could not load psycopg module.") + + else: + raise LLMWareException(message="Exception: need to import psycopg to use this class.") + + # second check for pg_vector specific driver and load if not present + global GLOBAL_PGVECTOR_IMPORT + if not GLOBAL_PGVECTOR_IMPORT: + if util.find_spec("pgvector"): + + try: + global pgvector + pgvector = importlib.import_module("pgvector") + GLOBAL_PGVECTOR_IMPORT = True + except: + raise LLMWareException(message="Exception: could not load pgvector module.") + + else: + raise LLMWareException(message="Exception: need to import neo4j to use this class.") + + """ + try: + from pgvector.psycopg import register_vector + import psycopg + except ImportError: + pass + """ # note: for initial connection, need to confirm that the database exists self.conn = psycopg.connect(host=postgres_host, port=postgres_port, dbname=postgres_db_name, @@ -1866,6 +2122,7 @@ def __init__(self, library, model=None, model_name=None, embedding_dims=None, fu # register vector extension self.conn.execute('CREATE EXTENSION IF NOT EXISTS vector') 
+ from pgvector.psycopg import register_vector register_vector(self.conn) if not self.full_schema: @@ -2033,13 +2290,13 @@ def create_new_embedding(self, doc_ids=None, batch_size=500): status.increment_embedding_status(self.library.library_name, self.model_name, len(sentences)) # will add configuration options to show/display - print(f"update: embedding_handler - PGVector - Embeddings Created: " - f"{embeddings_created} of {num_of_blocks}") + logger.info(f"update: embedding_handler - PGVector - Embeddings Created: " + f"{embeddings_created} of {num_of_blocks}") embedding_summary = self.utils.generate_embedding_summary(embeddings_created) embedded_blocks = embedding_summary["embedded_blocks"] - logging.info("update: EmbeddingHandler - PG_Vector - embedding_summary - %s", embedding_summary) + logger.info(f"update: EmbeddingHandler - PG_Vector - embedding_summary - {embedding_summary}") # safety check on output if not isinstance(embedded_blocks, int): @@ -2111,7 +2368,7 @@ def delete_index(self, collection_name=None): cursor = self.conn.cursor() cursor.execute(drop_command) - logging.info("update: embedding_handler - PG Vector - table dropped - %s", self.collection_name) + logger.info("update: embedding_handler - PG Vector - table dropped - %s", self.collection_name) # Commit your changes in the database self.conn.commit() @@ -2126,6 +2383,7 @@ def delete_index(self, collection_name=None): class EmbeddingNeo4j: + """Implements the interface to Neo4j as a vector database. ``EmbeddingNeo4j`` implements the interface to ``Neo4j``. It is used by the @@ -2150,13 +2408,13 @@ class EmbeddingNeo4j: embedding_Neo4j : EmbeddingNeo4j A new ``EmbeddingNeo4j`` object. 
""" + def __init__(self, library, model=None, model_name=None, embedding_dims=None): # look up model card if not model and not model_name: raise EmbeddingModelNotFoundException("no-model-or-model-name-provided") - self.library = library self.library_name = library.library_name self.model = model @@ -2177,12 +2435,36 @@ def __init__(self, library, model=None, model_name=None, embedding_dims=None): password = Neo4jConfig.get_config('password') database = Neo4jConfig.get_config('database') + global GLOBAL_NEO4J_IMPORT + if not GLOBAL_NEO4J_IMPORT: + if util.find_spec("neo4j"): + + try: + global neo4j + neo4j = importlib.import_module("neo4j") + GLOBAL_NEO4J_IMPORT = True + except: + raise LLMWareException(message="Exception: could not load neo4j module.") + + else: + raise LLMWareException(message="Exception: need to import neo4j to use this class.") + + # end dynamic import here + + # optional import of neo4j - not in project requirements + """ + try: + import neo4j + from neo4j import GraphDatabase + except: + pass + """ # Connect to Neo4J and verify connection. 
# Code taken from the code below # https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/vectorstores/neo4j_vector.py#L165C9-L177C14 try: - self.driver = GraphDatabase.driver(uri, auth=(user, password)) + self.driver = neo4j.GraphDatabase.driver(uri, auth=(user, password)) self.driver.verify_connectivity() except neo4j.exceptions.ServiceUnavailable: raise ValueError( @@ -2306,11 +2588,11 @@ def create_new_embedding(self, doc_ids=None, batch_size=500): embeddings_created += len(sentences) status.increment_embedding_status(self.library.library_name, self.model_name, len(sentences)) - print(f"update: embedding_handler - Neo4j - Embeddings Created: {embeddings_created} of {num_of_blocks}") - + logger.info(f"update: embedding_handler - Neo4j - Embeddings Created: " + f"{embeddings_created} of {num_of_blocks}") embedding_summary = self.utils.generate_embedding_summary(embeddings_created) - logging.info(f'update: EmbeddingHandler - Neo4j - embedding_summary - {embedding_summary}') + logger.info(f'update: EmbeddingHandler - Neo4j - embedding_summary - {embedding_summary}') return embedding_summary @@ -2334,15 +2616,17 @@ def search_index(self, query_embedding_vector, sample_count=10): return block_list def delete_index(self, index_name): + try: self._query(f"DROP INDEX $index_name", {'index_name': index_name}) - except DatabaseError: # Index did not exist yet + except neo4j.DatabaseError: # Index did not exist yet pass self.utils.unset_text_index() def _query(self, query, parameters=None): - from neo4j.exceptions import CypherSyntaxError + + # from neo4j.exceptions import CypherSyntaxError parameters = parameters or {} @@ -2350,7 +2634,7 @@ def _query(self, query, parameters=None): try: data = session.run(query, parameters) return [d.data() for d in data] - except CypherSyntaxError as e: + except neo4j.exceptions.CypherSyntaxError as e: raise ValueError(f'Cypher Statement is not valid\n{e}') @@ -2386,6 +2670,23 @@ def __init__(self, 
library, model=None, model_name=None, embedding_dims=None): # # General llmware set up code # + # confirm that pymilvus installed + + global GLOBAL_CHROMADB_IMPORT + if not GLOBAL_CHROMADB_IMPORT: + if util.find_spec("chromadb"): + + try: + global chromadb + chromadb = importlib.import_module("chromadb") + GLOBAL_CHROMADB_IMPORT = True + except: + raise LLMWareException(message="Exception: could not load chromadb module.") + + else: + raise LLMWareException(message="Exception: need to import chromadb to use this class.") + + # end dynamic import here # look up model card if not model and not model_name: @@ -2489,18 +2790,17 @@ def create_new_embedding(self, doc_ids=None, batch_size=500): embeddings=vectors, metadatas=metadatas) - current_index = self.utils.update_text_index(block_ids, current_index) # Update statistics embeddings_created += len(sentences) status.increment_embedding_status(self.library.library_name, self.model_name, len(sentences)) - print(f"update: embedding_handler - ChromaDB - Embeddings Created: {embeddings_created} of {num_of_blocks}") - + logger.info(f"update: embedding_handler - ChromaDB - Embeddings Created: " + f"{embeddings_created} of {num_of_blocks}") embedding_summary = self.utils.generate_embedding_summary(embeddings_created) - logging.info(f'update: EmbeddingHandler - ChromaDB - embedding_summary - {embedding_summary}') + logger.info(f'update: EmbeddingHandler - ChromaDB - embedding_summary - {embedding_summary}') return embedding_summary From 7aa9898b3823f94c85f19baf9b462568b423773c Mon Sep 17 00:00:00 2001 From: Darren Oberst <41238031+doberst@users.noreply.github.com> Date: Thu, 30 May 2024 16:40:02 -0400 Subject: [PATCH 47/48] Update using_milvus_lite.py --- examples/Embedding/using_milvus_lite.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/Embedding/using_milvus_lite.py b/examples/Embedding/using_milvus_lite.py index b7613630..4a4e718a 100644 --- a/examples/Embedding/using_milvus_lite.py +++ 
b/examples/Embedding/using_milvus_lite.py @@ -1,10 +1,10 @@ -"""" This example is a fast start with Milvus Lite, which is a 'no-install' file-based version of Milvus, intended +""" This example is a fast start with Milvus Lite, which is a 'no-install' file-based version of Milvus, intended for rapid prototyping. A couple of key points to note: -- Platform - per Milvus docs, Milvus Lite is designed for Mac and Linux (not on Windows currently) -- PyMilvus - need to `pip install pymilvus>=2.4.2` - -- within LLMWare: set MilvusConfig("lite", True) + -- within LLMWare: set MilvusConfig().set_config("lite", True) """ import os From b371082e47916565de7bc67bfb6405b4b642c5a6 Mon Sep 17 00:00:00 2001 From: Darren Oberst <41238031+doberst@users.noreply.github.com> Date: Thu, 30 May 2024 16:43:53 -0400 Subject: [PATCH 48/48] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 63fdc118..721001e4 100644 --- a/README.md +++ b/README.md @@ -805,6 +805,7 @@ Questions and discussions are welcome in our [github discussions](https://github See also [additional deployment/install release notes in wheel_archives](https://github.com/llmware-ai/llmware/tree/main/wheel_archives) **Wednesday, May 29 - v0.3.0-WIP** +- Added support for new Milvus Lite embedded 'no-install' database - see [example](https://github.com/llmware-ai/llmware/tree/main/examples/Embedding/using_milvus_lite.py). - Added two new SLIM models to catalog and agent processes - ['q-gen'](https://github.com/llmware-ai/llmware/tree/main/examples/SLIM-Agents/using-slim-q-gen.py) and ['qa-gen'](https://github.com/llmware-ai/llmware/tree/main/examples/SLIM-Agents/using-slim-qa-gen.py) - Updated model class instantiation to provide more extensibility to add new classes in different modules - Planning to remove torch and transformers from pip install package