From d6e6f5c0776276c82c9ec6a05a884bfdf15509cb Mon Sep 17 00:00:00 2001 From: Josh Bradley Date: Thu, 24 Oct 2024 14:22:32 -0400 Subject: [PATCH] Convert CLI to Typer app (#1305) --- .../patch-20241017135754184606.json | 4 + .vscode/launch.json | 4 +- docs/config/init.md | 9 +- docs/get_started.md | 22 +- docs/index/cli.md | 8 +- docs/prompt_tuning/auto_prompt_tuning.md | 8 +- docs/query/cli.md | 8 +- graphrag/__main__.py | 8 + graphrag/api/prompt_tune.py | 4 +- graphrag/cli/__init__.py | 4 + graphrag/{index/cli.py => cli/index.py} | 100 ++---- graphrag/cli/initialize.py | 67 ++++ graphrag/cli/main.py | 308 ++++++++++++++++++ .../cli.py => cli/prompt_tune.py} | 33 +- graphrag/{query/cli.py => cli/query.py} | 26 +- graphrag/config/load_config.py | 8 +- graphrag/index/__main__.py | 118 ------- graphrag/index/run/run.py | 3 +- graphrag/logging/types.py | 2 +- graphrag/prompt_tune/__main__.py | 118 ------- graphrag/prompt_tune/types.py | 2 +- graphrag/query/__main__.py | 103 ------ graphrag/utils/storage.py | 4 +- poetry.lock | 32 +- pyproject.toml | 12 +- tests/fixtures/min-csv/config.json | 16 +- tests/fixtures/text/config.json | 16 +- tests/notebook/test_notebooks.py | 2 +- tests/smoke/test_fixtures.py | 7 +- 29 files changed, 541 insertions(+), 515 deletions(-) create mode 100644 .semversioner/next-release/patch-20241017135754184606.json create mode 100644 graphrag/__main__.py create mode 100644 graphrag/cli/__init__.py rename graphrag/{index/cli.py => cli/index.py} (52%) create mode 100644 graphrag/cli/initialize.py create mode 100644 graphrag/cli/main.py rename graphrag/{prompt_tune/cli.py => cli/prompt_tune.py} (78%) rename graphrag/{query/cli.py => cli/query.py} (94%) delete mode 100644 graphrag/index/__main__.py delete mode 100644 graphrag/prompt_tune/__main__.py delete mode 100644 graphrag/query/__main__.py diff --git a/.semversioner/next-release/patch-20241017135754184606.json b/.semversioner/next-release/patch-20241017135754184606.json new file mode 100644 
index 0000000000..477ecc1a36 --- /dev/null +++ b/.semversioner/next-release/patch-20241017135754184606.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "reorganize cli layer" +} diff --git a/.vscode/launch.json b/.vscode/launch.json index 5d8bec3194..2167063966 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -21,7 +21,7 @@ "poe", "query", "--root", "", "--method", "global", - "What are the top themes in this story", + "--query", "What are the top themes in this story", ] }, { @@ -30,7 +30,7 @@ "request": "launch", "module": "poetry", "args": [ - "poe", "prompt_tune", + "poe", "prompt-tune", "--config", "/settings.yaml", ] diff --git a/docs/config/init.md b/docs/config/init.md index ba92e3b10f..c3682bde92 100644 --- a/docs/config/init.md +++ b/docs/config/init.md @@ -1,22 +1,21 @@ # Configuring GraphRAG Indexing -To start using GraphRAG, you need to configure the system. The `init` command is the easiest way to get started. It will create a `.env` and `settings.yaml` files in the specified directory with the necessary configuration settings. It will also output the default LLM prompts used by GraphRAG. +To start using GraphRAG, you must generate a configuration file. The `init` command is the easiest way to get started. It will create `.env` and `settings.yaml` files in the specified directory with the necessary configuration settings. It will also output the default LLM prompts used by GraphRAG. ## Usage ```sh -python -m graphrag.index [--init] [--root PATH] +graphrag init [--root PATH] ``` ## Options -- `--init` - Initialize the directory with the necessary configuration files. -- `--root PATH` - The root directory to initialize. Default is the current directory. 
## Example ```sh -python -m graphrag.index --init --root ./ragtest +graphrag init --root ./ragtest ``` ## Output diff --git a/docs/get_started.md b/docs/get_started.md index 72aa10feae..3db4a82cc1 100644 --- a/docs/get_started.md +++ b/docs/get_started.md @@ -52,23 +52,23 @@ Next we'll inject some required config variables: First let's make sure to setup the required environment variables. For details on these environment variables, and what environment variables are available, see the [variables documentation](config/overview.md). -To initialize your workspace, let's first run the `graphrag.index --init` command. -Since we have already configured a directory named \.ragtest` in the previous step, we can run the following command: +To initialize your workspace, first run the `graphrag init` command. +Since we have already configured a directory named `./ragtest` in the previous step, run the following command: ```sh -python -m graphrag.index --init --root ./ragtest +graphrag init --root ./ragtest ``` This will create two files: `.env` and `settings.yaml` in the `./ragtest` directory. - `.env` contains the environment variables required to run the GraphRAG pipeline. If you inspect the file, you'll see a single environment variable defined, - `GRAPHRAG_API_KEY=`. This is the API key for the OpenAI API or Azure OpenAI endpoint. You can replace this with your own API key. + `GRAPHRAG_API_KEY=`. This is the API key for the OpenAI API or Azure OpenAI endpoint. You can replace this with your own API key. If you are using another form of authentication (i.e. managed identity), please delete this file. - `settings.yaml` contains the settings for the pipeline. You can modify this file to change the settings for the pipeline.
#### OpenAI and Azure OpenAI -To run in OpenAI mode, just make sure to update the value of `GRAPHRAG_API_KEY` in the `.env` file with your OpenAI API key. +If running in OpenAI mode, update the value of `GRAPHRAG_API_KEY` in the `.env` file with your OpenAI API key. #### Azure OpenAI @@ -90,13 +90,13 @@ deployment_name: Finally we'll run the pipeline! ```sh -python -m graphrag.index --root ./ragtest +graphrag index --root ./ragtest ``` ![pipeline executing from the CLI](img/pipeline-running.png) This process will take some time to run. This depends on the size of your input data, what model you're using, and the text chunk size being used (these can be configured in your `settings.yml` file). -Once the pipeline is complete, you should see a new folder called `./ragtest/output//artifacts` with a series of parquet files. +Once the pipeline is complete, you should see a new folder called `./ragtest/output` with a series of parquet files. # Using the Query Engine @@ -107,19 +107,19 @@ Now let's ask some questions using this dataset. Here is an example using Global search to ask a high-level question: ```sh -python -m graphrag.query \ +graphrag query \ --root ./ragtest \ --method global \ -"What are the top themes in this story?" +--query "What are the top themes in this story?" ``` Here is an example using Local search to ask a more specific question about a particular character: ```sh -python -m graphrag.query \ +graphrag query \ --root ./ragtest \ --method local \ -"Who is Scrooge, and what are his main relationships?" +--query "Who is Scrooge and what are his main relationships?" ``` Please refer to [Query Engine](query/overview.md) docs for detailed information about how to leverage our Local and Global search mechanisms for extracting meaningful insights from data after the Indexer has wrapped up execution. 
diff --git a/docs/index/cli.md b/docs/index/cli.md index 8ec581b7bb..9f479771ca 100644 --- a/docs/index/cli.md +++ b/docs/index/cli.md @@ -3,21 +3,21 @@ The GraphRAG indexer CLI allows for no-code usage of the GraphRAG Indexer. ```bash -python -m graphrag.index --verbose --root \ +graphrag index --verbose --root \ --config --resume \ --reporter --emit json,csv,parquet \ ---nocache +--no-cache ``` ## CLI Arguments - `--verbose` - Adds extra logging information during the run. - `--root ` - the data root directory. This should contain an `input` directory with the input data, and an `.env` file with environment variables. These are described below. -- `--init` - This will initialize the data project directory at the specified `root` with bootstrap configuration and prompt-overrides. - `--resume ` - if specified, the pipeline will attempt to resume a prior run. The parquet files from the prior run will be loaded into the system as inputs, and the workflows that generated those files will be skipped. The input value should be the timestamped output folder, e.g. "20240105-143721". - `--config ` - This will opt-out of the Default Configuration mode and execute a custom configuration. If this is used, then none of the environment-variables below will apply. - `--reporter ` - This will specify the progress reporter to use. The default is `rich`. Valid values are `rich`, `print`, and `none`. +- `--dry-run` - Runs the indexing pipeline without executing any steps in order to inspect and validate the configuration file. - `--emit ` - This specifies the table output formats the pipeline should emit. The default is `parquet`. Valid values are `parquet`, `csv`, and `json`, comma-separated. -- `--nocache` - This will disable the caching mechanism. This is useful for debugging and development, but should not be used in production. +- `--no-cache` - This will disable the caching mechanism. This is useful for debugging and development, but should not be used in production. 
- `--output ` - Specify the output directory for pipeline artifacts. - `--reports ` - Specify the output directory for reporting. diff --git a/docs/prompt_tuning/auto_prompt_tuning.md b/docs/prompt_tuning/auto_prompt_tuning.md index e0b39c88b5..6279fc5e39 100644 --- a/docs/prompt_tuning/auto_prompt_tuning.md +++ b/docs/prompt_tuning/auto_prompt_tuning.md @@ -13,14 +13,14 @@ Figure 1: Auto Tuning Conceptual Diagram. ## Prerequisites -Before running auto tuning make sure you have already initialized your workspace with the `graphrag.index --init` command. This will create the necessary configuration files and the default prompts. Refer to the [Init Documentation](../config/init.md) for more information about the initialization process. +Before running auto tuning, ensure you have already initialized your workspace with the `graphrag init` command. This will create the necessary configuration files and the default prompts. Refer to the [Init Documentation](../config/init.md) for more information about the initialization process. 
## Usage You can run the main script from the command line with various options: ```bash -python -m graphrag.prompt_tune [--root ROOT] [--domain DOMAIN] [--method METHOD] [--limit LIMIT] [--language LANGUAGE] \ +graphrag prompt-tune [--root ROOT] [--domain DOMAIN] [--selection-method METHOD] [--limit LIMIT] [--language LANGUAGE] \ [--max-tokens MAX_TOKENS] [--chunk-size CHUNK_SIZE] [--n-subset-max N_SUBSET_MAX] [--k K] \ [--min-examples-required MIN_EXAMPLES_REQUIRED] [--no-entity-types] [--output OUTPUT] ``` @@ -56,7 +56,7 @@ python -m graphrag.prompt_tune [--root ROOT] [--domain DOMAIN] [--method METHOD ## Example Usage ```bash -python -m graphrag.prompt_tune --root /path/to/project --config /path/to/settings.yaml --domain "environmental news" \ +graphrag prompt-tune --root /path/to/project --config /path/to/settings.yaml --domain "environmental news" \ --method random --limit 10 --language English --max-tokens 2048 --chunk-size 256 --min-examples-required 3 \ --no-entity-types --output /path/to/output ``` @@ -64,7 +64,7 @@ python -m graphrag.prompt_tune --root /path/to/project --config /path/to/setting or, with minimal configuration (suggested): ```bash -python -m graphrag.prompt_tune --root /path/to/project --config /path/to/settings.yaml --no-entity-types +graphrag prompt-tune --root /path/to/project --config /path/to/settings.yaml --no-discover-entity-types ``` ## Document Selection Methods diff --git a/docs/query/cli.md b/docs/query/cli.md index a8ce493bb6..10d3a92e2d 100644 --- a/docs/query/cli.md +++ b/docs/query/cli.md @@ -3,15 +3,15 @@ The GraphRAG query CLI allows for no-code usage of the GraphRAG Query engine. ```bash -python -m graphrag.query --config --data --community_level --response_type --method <"local"|"global"> +graphrag query --config --data --community-level --response-type --method <"local"|"global"> ``` ## CLI Arguments - `--config ` - The configuration yaml file to use when running the query. 
If this is used, then none of the environment-variables below will apply. - `--data ` - Folder containing the `.parquet` output files from running the Indexer. -- `--community_level ` - Community level in the Leiden community hierarchy from which we will load the community reports higher value means we use reports on smaller communities. Default: 2 -- `--response_type ` - Free form text describing the response type and format, can be anything, e.g. `Multiple Paragraphs`, `Single Paragraph`, `Single Sentence`, `List of 3-7 Points`, `Single Page`, `Multi-Page Report`. Default: `Multiple Paragraphs`. +- `--community-level ` - Community level in the Leiden community hierarchy from which we will load the community reports; higher value means we use reports on smaller communities. Default: 2 +- `--response-type ` - Free form text describing the response type and format, can be anything, e.g. `Multiple Paragraphs`, `Single Paragraph`, `Single Sentence`, `List of 3-7 Points`, `Single Page`, `Multi-Page Report`. Default: `Multiple Paragraphs`. - `--method <"local"|"global">` - Method to use to answer the query, one of local or global. For more information check [Overview](overview.md) - `--streaming` - Stream back the LLM response @@ -41,4 +41,4 @@ You can further customize the execution by providing these environment variables - `GRAPHRAG_GLOBAL_SEARCH_DATA_MAX_TOKENS` - Change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000). Default: `12000` - `GRAPHRAG_GLOBAL_SEARCH_MAP_MAX_TOKENS` - Default: `500` - `GRAPHRAG_GLOBAL_SEARCH_REDUCE_MAX_TOKENS` - Change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000-1500). 
Default: `2000` -- `GRAPHRAG_GLOBAL_SEARCH_CONCURRENCY` - Default: `32` \ No newline at end of file +- `GRAPHRAG_GLOBAL_SEARCH_CONCURRENCY` - Default: `32` diff --git a/graphrag/__main__.py b/graphrag/__main__.py new file mode 100644 index 0000000000..ae2421478c --- /dev/null +++ b/graphrag/__main__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""The GraphRAG package.""" + +from .cli.main import app + +app(prog_name="graphrag") diff --git a/graphrag/api/prompt_tune.py b/graphrag/api/prompt_tune.py index dfaaa91376..4363c71b10 100644 --- a/graphrag/api/prompt_tune.py +++ b/graphrag/api/prompt_tune.py @@ -47,7 +47,7 @@ async def generate_indexing_prompts( domain: str | None = None, language: str | None = None, max_tokens: int = MAX_TOKEN_COUNT, - skip_entity_types: bool = False, + discover_entity_types: bool = True, min_examples_required: PositiveInt = 2, n_subset_max: PositiveInt = 300, k: PositiveInt = 15, @@ -114,7 +114,7 @@ async def generate_indexing_prompts( ) entity_types = None - if not skip_entity_types: + if discover_entity_types: reporter.info("Generating entity types...") entity_types = await generate_entity_types( llm, diff --git a/graphrag/cli/__init__.py b/graphrag/cli/__init__.py new file mode 100644 index 0000000000..23017822da --- /dev/null +++ b/graphrag/cli/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""CLI for GraphRAG.""" diff --git a/graphrag/index/cli.py b/graphrag/cli/index.py similarity index 52% rename from graphrag/index/cli.py rename to graphrag/cli/index.py index d20166db9a..20c8bf1c75 100644 --- a/graphrag/index/cli.py +++ b/graphrag/cli/index.py @@ -1,7 +1,7 @@ # Copyright (c) 2024 Microsoft Corporation. 
# Licensed under the MIT License -"""Main definition.""" +"""CLI implementation of index subcommand.""" import asyncio import logging @@ -17,17 +17,11 @@ load_config, resolve_paths, ) +from graphrag.index.emit.types import TableEmitterType +from graphrag.index.validate_config import validate_config_names from graphrag.logging import ProgressReporter, ReporterType, create_progress_reporter from graphrag.utils.cli import redact -from .emit.types import TableEmitterType -from .graph.extractors.claims.prompts import CLAIM_EXTRACTION_PROMPT -from .graph.extractors.community_reports.prompts import COMMUNITY_REPORT_PROMPT -from .graph.extractors.graph.prompts import GRAPH_EXTRACTION_PROMPT -from .graph.extractors.summarize.prompts import SUMMARIZE_PROMPT -from .init_content import INIT_DOTENV, INIT_YAML -from .validate_config import validate_config_names - # Ignore warnings from numba warnings.filterwarnings("ignore", message=".*NumbaDeprecationWarning.*") @@ -72,37 +66,32 @@ def handle_signal(signum, _): def index_cli( - root_dir: str, - init: bool, + root_dir: Path, verbose: bool, - resume: str, + resume: str | None, update_index_id: str | None, memprofile: bool, - nocache: bool, + cache: bool, reporter: ReporterType, - config_filepath: str | None, + config_filepath: Path | None, emit: list[TableEmitterType], - dryrun: bool, - skip_validations: bool, - output_dir: str | None, + dry_run: bool, + skip_validation: bool, + output_dir: Path | None, ): """Run the pipeline with the given config.""" progress_reporter = create_progress_reporter(reporter) info, error, success = _logger(progress_reporter) run_id = resume or update_index_id or time.strftime("%Y%m%d-%H%M%S") - if init: - _initialize_project_at(root_dir, progress_reporter) - sys.exit(0) - - root = Path(root_dir).resolve() - config = load_config(root, config_filepath) - - config.storage.base_dir = output_dir or config.storage.base_dir - config.reporting.base_dir = output_dir or config.reporting.base_dir + config = 
load_config(root_dir, config_filepath) + config.storage.base_dir = str(output_dir) if output_dir else config.storage.base_dir + config.reporting.base_dir = ( + str(output_dir) if output_dir else config.reporting.base_dir + ) resolve_paths(config, run_id) - if nocache: + if not cache: config.cache.type = CacheType.none enabled_logging, log_path = enable_logging_with_config(config, verbose) @@ -114,16 +103,16 @@ def index_cli( True, ) - if skip_validations: + if skip_validation: validate_config_names(progress_reporter, config) - info(f"Starting pipeline run for: {run_id}, {dryrun=}", verbose) + info(f"Starting pipeline run for: {run_id}, {dry_run=}", verbose) info( f"Using default configuration: {redact(config.model_dump())}", verbose, ) - if dryrun: + if dry_run: info("Dry run complete, exiting...", True) sys.exit(0) @@ -153,54 +142,3 @@ def index_cli( success("All workflows completed successfully.", True) sys.exit(1 if encountered_errors else 0) - - -def _initialize_project_at(path: str, reporter: ProgressReporter) -> None: - """Initialize the project at the given path.""" - reporter.info(f"Initializing project at {path}") - root = Path(path) - if not root.exists(): - root.mkdir(parents=True, exist_ok=True) - - settings_yaml = root / "settings.yaml" - if settings_yaml.exists(): - msg = f"Project already initialized at {root}" - raise ValueError(msg) - - with settings_yaml.open("wb") as file: - file.write(INIT_YAML.encode(encoding="utf-8", errors="strict")) - - dotenv = root / ".env" - if not dotenv.exists(): - with dotenv.open("wb") as file: - file.write(INIT_DOTENV.encode(encoding="utf-8", errors="strict")) - - prompts_dir = root / "prompts" - if not prompts_dir.exists(): - prompts_dir.mkdir(parents=True, exist_ok=True) - - entity_extraction = prompts_dir / "entity_extraction.txt" - if not entity_extraction.exists(): - with entity_extraction.open("wb") as file: - file.write( - GRAPH_EXTRACTION_PROMPT.encode(encoding="utf-8", errors="strict") - ) - - 
summarize_descriptions = prompts_dir / "summarize_descriptions.txt" - if not summarize_descriptions.exists(): - with summarize_descriptions.open("wb") as file: - file.write(SUMMARIZE_PROMPT.encode(encoding="utf-8", errors="strict")) - - claim_extraction = prompts_dir / "claim_extraction.txt" - if not claim_extraction.exists(): - with claim_extraction.open("wb") as file: - file.write( - CLAIM_EXTRACTION_PROMPT.encode(encoding="utf-8", errors="strict") - ) - - community_report = prompts_dir / "community_report.txt" - if not community_report.exists(): - with community_report.open("wb") as file: - file.write( - COMMUNITY_REPORT_PROMPT.encode(encoding="utf-8", errors="strict") - ) diff --git a/graphrag/cli/initialize.py b/graphrag/cli/initialize.py new file mode 100644 index 0000000000..b861132597 --- /dev/null +++ b/graphrag/cli/initialize.py @@ -0,0 +1,67 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""CLI implementation of initialization subcommand.""" + +from pathlib import Path + +from graphrag.index.graph.extractors.claims.prompts import CLAIM_EXTRACTION_PROMPT +from graphrag.index.graph.extractors.community_reports.prompts import ( + COMMUNITY_REPORT_PROMPT, +) +from graphrag.index.graph.extractors.graph.prompts import GRAPH_EXTRACTION_PROMPT +from graphrag.index.graph.extractors.summarize.prompts import SUMMARIZE_PROMPT +from graphrag.index.init_content import INIT_DOTENV, INIT_YAML +from graphrag.logging import ReporterType, create_progress_reporter + + +def initialize_project_at(path: Path) -> None: + """Initialize the project at the given path.""" + progress_reporter = create_progress_reporter(ReporterType.RICH) + progress_reporter.info(f"Initializing project at {path}") + root = Path(path) + if not root.exists(): + root.mkdir(parents=True, exist_ok=True) + + settings_yaml = root / "settings.yaml" + if settings_yaml.exists(): + msg = f"Project already initialized at {root}" + raise ValueError(msg) + + with 
settings_yaml.open("wb") as file: + file.write(INIT_YAML.encode(encoding="utf-8", errors="strict")) + + dotenv = root / ".env" + if not dotenv.exists(): + with dotenv.open("wb") as file: + file.write(INIT_DOTENV.encode(encoding="utf-8", errors="strict")) + + prompts_dir = root / "prompts" + if not prompts_dir.exists(): + prompts_dir.mkdir(parents=True, exist_ok=True) + + entity_extraction = prompts_dir / "entity_extraction.txt" + if not entity_extraction.exists(): + with entity_extraction.open("wb") as file: + file.write( + GRAPH_EXTRACTION_PROMPT.encode(encoding="utf-8", errors="strict") + ) + + summarize_descriptions = prompts_dir / "summarize_descriptions.txt" + if not summarize_descriptions.exists(): + with summarize_descriptions.open("wb") as file: + file.write(SUMMARIZE_PROMPT.encode(encoding="utf-8", errors="strict")) + + claim_extraction = prompts_dir / "claim_extraction.txt" + if not claim_extraction.exists(): + with claim_extraction.open("wb") as file: + file.write( + CLAIM_EXTRACTION_PROMPT.encode(encoding="utf-8", errors="strict") + ) + + community_report = prompts_dir / "community_report.txt" + if not community_report.exists(): + with community_report.open("wb") as file: + file.write( + COMMUNITY_REPORT_PROMPT.encode(encoding="utf-8", errors="strict") + ) diff --git a/graphrag/cli/main.py b/graphrag/cli/main.py new file mode 100644 index 0000000000..aae537dad9 --- /dev/null +++ b/graphrag/cli/main.py @@ -0,0 +1,308 @@ +# Copyright (c) 2024 Microsoft Corporation. 
+# Licensed under the MIT License + +"""CLI entrypoint.""" + +import asyncio +from enum import Enum +from pathlib import Path +from typing import Annotated + +import typer + +from graphrag.api import DocSelectionType +from graphrag.index.emit.types import TableEmitterType +from graphrag.logging import ReporterType +from graphrag.prompt_tune.generator import MAX_TOKEN_COUNT +from graphrag.prompt_tune.loader import MIN_CHUNK_SIZE + +from .index import index_cli +from .initialize import initialize_project_at +from .prompt_tune import prompt_tune +from .query import run_global_search, run_local_search + +INVALID_METHOD_ERROR = "Invalid method" + +app = typer.Typer( + help="GraphRAG: A graph-based retrieval-augmented generation (RAG) system.", + no_args_is_help=True, +) + + +class SearchType(Enum): + """The type of search to run.""" + + LOCAL = "local" + GLOBAL = "global" + + def __str__(self): + """Return the string representation of the enum value.""" + return self.value + + +@app.command("init") +def _initialize_cli( + root: Annotated[ + Path, + typer.Option( + help="The project root directory.", + dir_okay=True, + writable=True, + resolve_path=True, + ), + ], +): + """Generate a default configuration file.""" + initialize_project_at(path=root) + + +@app.command("index") +def _index_cli( + config: Annotated[ + Path | None, + typer.Option( + help="The configuration to use.", exists=True, file_okay=True, readable=True + ), + ] = None, + root: Annotated[ + Path, + typer.Option( + help="The project root directory.", + exists=True, + dir_okay=True, + writable=True, + resolve_path=True, + ), + ] = Path(), # set default to current directory + verbose: Annotated[ + bool, typer.Option(help="Run the indexing pipeline with verbose logging") + ] = False, + memprofile: Annotated[ + bool, typer.Option(help="Run the indexing pipeline with memory profiling") + ] = False, + resume: Annotated[ + str | None, typer.Option(help="Resume a given indexing run") + ] = None, + reporter: 
Annotated[ + ReporterType, typer.Option(help="The progress reporter to use.") + ] = ReporterType.RICH, + emit: Annotated[ + str, typer.Option(help="The data formats to emit, comma-separated.") + ] = TableEmitterType.Parquet.value, + dry_run: Annotated[ + bool, + typer.Option( + help="Run the indexing pipeline without executing any steps to inspect and validate the configuration." + ), + ] = False, + cache: Annotated[bool, typer.Option(help="Use LLM cache.")] = True, + skip_validation: Annotated[ + bool, + typer.Option( + help="Skip any preflight validation. Useful when running no LLM steps." + ), + ] = False, + update_index: Annotated[ + str | None, + typer.Option( + help="Update an index run id, leveraging previous outputs and applying new indexes." + ), + ] = None, + output: Annotated[ + Path | None, + typer.Option( + help="Indexing pipeline output directory. Overrides storage.base_dir in the configuration file.", + dir_okay=True, + writable=True, + resolve_path=True, + ), + ] = None, +): + """Build a knowledge graph index.""" + if resume and update_index: + msg = "Cannot resume and update a run at the same time" + raise ValueError(msg) + + index_cli( + root_dir=root, + verbose=verbose, + resume=resume, + update_index_id=update_index, + memprofile=memprofile, + cache=cache, + reporter=ReporterType(reporter), + config_filepath=config, + emit=[TableEmitterType(value.strip()) for value in emit.split(",")], + dry_run=dry_run, + skip_validation=skip_validation, + output_dir=output, + ) + + +@app.command("prompt-tune") +def _prompt_tune_cli( + root: Annotated[ + Path, + typer.Option( + help="The project root directory.", + exists=True, + dir_okay=True, + writable=True, + resolve_path=True, + ), + ] = Path(), # set default to current directory + config: Annotated[ + Path | None, + typer.Option( + help="The configuration to use.", exists=True, file_okay=True, readable=True + ), + ] = None, + domain: Annotated[ + str | None, + typer.Option( + help="The domain your input 
data is related to. For example 'space science', 'microbiology', 'environmental news'. If not defined, a domain will be inferred from the input data." + ), + ] = None, + selection_method: Annotated[ + DocSelectionType, typer.Option(help="The text chunk selection method.") + ] = DocSelectionType.RANDOM, + n_subset_max: Annotated[ + int, + typer.Option( + help="The number of text chunks to embed when --selection-method=auto." + ), + ] = 300, + k: Annotated[ + int, + typer.Option( + help="The maximum number of documents to select from each centroid when --selection-method=auto." + ), + ] = 15, + limit: Annotated[ + int, + typer.Option( + help="The number of documents to load when --selection-method={random,top}." + ), + ] = 15, + max_tokens: Annotated[ + int, typer.Option(help="The max token count for prompt generation.") + ] = MAX_TOKEN_COUNT, + min_examples_required: Annotated[ + int, + typer.Option( + help="The minimum number of examples to generate/include in the entity extraction prompt." + ), + ] = 2, + chunk_size: Annotated[ + int, typer.Option(help="The max token count for prompt generation.") + ] = MIN_CHUNK_SIZE, + language: Annotated[ + str | None, + typer.Option( + help="The primary language used for inputs and outputs in graphrag prompts." + ), + ] = None, + discover_entity_types: Annotated[ + bool, typer.Option(help="Discover and extract unspecified entity types.") + ] = True, + output: Annotated[ + Path, + typer.Option( + help="The directory to save prompts to, relative to the project root directory.", + dir_okay=True, + writable=True, + resolve_path=True, + ), + ] = Path("prompts"), +): + """Generate custom graphrag prompts with your own data (i.e. 
auto templating).""" + loop = asyncio.get_event_loop() + loop.run_until_complete( + prompt_tune( + root=root, + config=config, + domain=domain, + selection_method=selection_method, + limit=limit, + max_tokens=max_tokens, + chunk_size=chunk_size, + language=language, + discover_entity_types=discover_entity_types, + output=output, + n_subset_max=n_subset_max, + k=k, + min_examples_required=min_examples_required, + ) + ) + + +@app.command("query") +def _query_cli( + method: Annotated[SearchType, typer.Option(help="The query algorithm to use.")], + query: Annotated[str, typer.Option(help="The query to execute.")], + config: Annotated[ + Path | None, + typer.Option( + help="The configuration to use.", exists=True, file_okay=True, readable=True + ), + ] = None, + data: Annotated[ + Path | None, + typer.Option( + help="Indexing pipeline output directory (i.e. contains the parquet files).", + exists=True, + dir_okay=True, + readable=True, + resolve_path=True, + ), + ] = None, + root: Annotated[ + Path, + typer.Option( + help="The project root directory.", + exists=True, + dir_okay=True, + writable=True, + resolve_path=True, + ), + ] = Path(), # set default to current directory + community_level: Annotated[ + int, + typer.Option( + help="The community level in the Leiden community hierarchy from which to load community reports. Higher values represent reports from smaller communities." + ), + ] = 2, + response_type: Annotated[ + str, + typer.Option( + help="Free form text describing the response type and format, can be anything, e.g. Multiple Paragraphs, Single Paragraph, Single Sentence, List of 3-7 Points, Single Page, Multi-Page Report. 
Default: Multiple Paragraphs" + ), + ] = "Multiple Paragraphs", + streaming: Annotated[ + bool, typer.Option(help="Print response in a streaming manner.") + ] = False, +): + """Query a knowledge graph index.""" + match method: + case SearchType.LOCAL: + run_local_search( + config_filepath=config, + data_dir=data, + root_dir=root, + community_level=community_level, + response_type=response_type, + streaming=streaming, + query=query, + ) + case SearchType.GLOBAL: + run_global_search( + config_filepath=config, + data_dir=data, + root_dir=root, + community_level=community_level, + response_type=response_type, + streaming=streaming, + query=query, + ) + case _: + raise ValueError(INVALID_METHOD_ERROR) diff --git a/graphrag/prompt_tune/cli.py b/graphrag/cli/prompt_tune.py similarity index 78% rename from graphrag/prompt_tune/cli.py rename to graphrag/cli/prompt_tune.py index 0232b0537c..cbb36e00ba 100644 --- a/graphrag/prompt_tune/cli.py +++ b/graphrag/cli/prompt_tune.py @@ -1,30 +1,35 @@ # Copyright (c) 2024 Microsoft Corporation. 
# Licensed under the MIT License -"""Command line interface for the fine_tune module.""" +"""CLI implementation of prompt-tune subcommand.""" from pathlib import Path import graphrag.api as api from graphrag.config import load_config from graphrag.logging import PrintProgressReporter - -from .generator.community_report_summarization import COMMUNITY_SUMMARIZATION_FILENAME -from .generator.entity_extraction_prompt import ENTITY_EXTRACTION_FILENAME -from .generator.entity_summarization_prompt import ENTITY_SUMMARIZATION_FILENAME +from graphrag.prompt_tune.generator.community_report_summarization import ( + COMMUNITY_SUMMARIZATION_FILENAME, +) +from graphrag.prompt_tune.generator.entity_extraction_prompt import ( + ENTITY_EXTRACTION_FILENAME, +) +from graphrag.prompt_tune.generator.entity_summarization_prompt import ( + ENTITY_SUMMARIZATION_FILENAME, +) async def prompt_tune( - config: str, - root: str, - domain: str, + root: Path, + config: Path | None, + domain: str | None, selection_method: api.DocSelectionType, limit: int, max_tokens: int, chunk_size: int, language: str | None, - skip_entity_types: bool, - output: str, + discover_entity_types: bool, + output: Path, n_subset_max: int, k: int, min_examples_required: int, @@ -41,8 +46,8 @@ async def prompt_tune( - max_tokens: The maximum number of tokens to use on entity extraction prompts. - chunk_size: The chunk token size to use. - language: The language to use for the prompts. - - skip_entity_types: Skip generating entity types. - - output: The output folder to store the prompts. Relative to the root directory. + - discover_entity_types: Generate entity types. + - output: The output folder to store the prompts. - n_subset_max: The number of text chunks to embed when using auto selection method. - k: The number of documents to select when using auto selection method. - min_examples_required: The minimum number of examples required for entity extraction prompts. 
@@ -60,13 +65,13 @@ async def prompt_tune( domain=domain, language=language, max_tokens=max_tokens, - skip_entity_types=skip_entity_types, + discover_entity_types=discover_entity_types, min_examples_required=min_examples_required, n_subset_max=n_subset_max, k=k, ) - output_path = (root_path / output).resolve() + output_path = output.resolve() if output_path: reporter.info(f"Writing prompts to {output_path}") output_path.mkdir(parents=True, exist_ok=True) diff --git a/graphrag/query/cli.py b/graphrag/cli/query.py similarity index 94% rename from graphrag/query/cli.py rename to graphrag/cli/query.py index 23312ec4d2..d2f5f9e67f 100644 --- a/graphrag/query/cli.py +++ b/graphrag/cli/query.py @@ -1,7 +1,7 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License -"""Command line interface for the query module.""" +"""CLI implementation of query subcommand.""" import asyncio import sys @@ -19,9 +19,9 @@ def run_global_search( - config_filepath: str | None, - data_dir: str | None, - root_dir: str, + config_filepath: Path | None, + data_dir: Path | None, + root_dir: Path, community_level: int, response_type: str, streaming: bool, @@ -31,10 +31,9 @@ def run_global_search( Loads index files required for global search and calls the Query API. """ - root = Path(root_dir).resolve() + root = root_dir.resolve() config = load_config(root, config_filepath) - - config.storage.base_dir = data_dir or config.storage.base_dir + config.storage.base_dir = str(data_dir) if data_dir else config.storage.base_dir resolve_paths(config) dataframe_dict = _resolve_parquet_files( @@ -99,9 +98,9 @@ async def run_streaming_search(): def run_local_search( - config_filepath: str | None, - data_dir: str | None, - root_dir: str, + config_filepath: Path | None, + data_dir: Path | None, + root_dir: Path, community_level: int, response_type: str, streaming: bool, @@ -111,10 +110,9 @@ def run_local_search( Loads index files required for local search and calls the Query API. 
""" - root = Path(root_dir).resolve() + root = root_dir.resolve() config = load_config(root, config_filepath) - - config.storage.base_dir = data_dir or config.storage.base_dir + config.storage.base_dir = str(data_dir) if data_dir else config.storage.base_dir resolve_paths(config) dataframe_dict = _resolve_parquet_files( @@ -190,7 +188,7 @@ async def run_streaming_search(): def _resolve_parquet_files( - root_dir: str, + root_dir: Path, config: GraphRagConfig, parquet_list: list[str], optional_list: list[str], diff --git a/graphrag/config/load_config.py b/graphrag/config/load_config.py index d46f01bd6b..c4133a7196 100644 --- a/graphrag/config/load_config.py +++ b/graphrag/config/load_config.py @@ -11,8 +11,8 @@ def load_config( - root_dir: str | Path, - config_filepath: str | None = None, + root_dir: Path, + config_filepath: Path | None = None, ) -> GraphRagConfig: """Load configuration from a file or create a default configuration. @@ -27,11 +27,11 @@ def load_config( If None, searches for config file in root and if not found creates a default configuration. """ - root = Path(root_dir).resolve() + root = root_dir.resolve() # If user specified a config file path then it is required if config_filepath: - config_path = Path(config_filepath).resolve() + config_path = config_filepath.resolve() if not config_path.exists(): msg = f"Specified Config file not found: {config_path}" raise FileNotFoundError(msg) diff --git a/graphrag/index/__main__.py b/graphrag/index/__main__.py deleted file mode 100644 index bdf8a63a06..0000000000 --- a/graphrag/index/__main__.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""The Indexing Engine package root.""" - -import argparse - -from graphrag.logging import ReporterType -from graphrag.utils.cli import dir_exist, file_exist - -from .cli import index_cli -from .emit.types import TableEmitterType - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="python -m graphrag.index", - description="The graphrag indexing engine", - ) - parser.add_argument( - "--config", - help="The configuration yaml file to use when running the indexing pipeline", - type=file_exist, - ) - parser.add_argument( - "-v", - "--verbose", - help="Run the pipeline with verbose logging", - action="store_true", - ) - parser.add_argument( - "--memprofile", - help="Run the pipeline with memory profiling", - action="store_true", - ) - parser.add_argument( - "--root", - help="The root directory to use for input data and output data, if no configuration is defined. Default: current directory", - # Only required if config is not defined - required=False, - default=".", - type=dir_exist, - ) - parser.add_argument( - "--resume", - help="Resume a given data run leveraging Parquet output files", - # Only required if config is not defined - required=False, - default="", - type=str, - ) - parser.add_argument( - "--reporter", - help="The progress reporter to use. Default: rich", - default=ReporterType.RICH, - type=ReporterType, - choices=list(ReporterType), - ) - parser.add_argument( - "--emit", - help="The data formats to emit, comma-separated. 
Default: parquet", - default=TableEmitterType.Parquet.value, - type=str, - choices=list(TableEmitterType), - ) - parser.add_argument( - "--dryrun", - help="Run the pipeline without executing any steps to inspect/validate the configuration", - action="store_true", - ) - parser.add_argument( - "--nocache", help="Disable LLM cache", action="store_true", default=False - ) - parser.add_argument( - "--init", - help="Create an initial configuration in the given path", - action="store_true", - ) - parser.add_argument( - "--skip-validations", - help="Skip any preflight validation. Useful when running no LLM steps", - action="store_true", - ) - parser.add_argument( - "--update-index", - help="Update a given index run id, leveraging previous outputs and applying new indexes", - # Only required if config is not defined - required=False, - default=None, - type=str, - ) - parser.add_argument( - "--output", - help="The output directory to use for the pipeline.", - required=False, - default=None, - type=str, - ) - args = parser.parse_args() - - if args.resume and args.update_index: - msg = "Cannot resume and update a run at the same time" - raise ValueError(msg) - - index_cli( - root_dir=args.root, - verbose=args.verbose, - resume=args.resume, - update_index_id=args.update_index, - memprofile=args.memprofile, - nocache=args.nocache, - reporter=args.reporter, - config_filepath=args.config, - emit=[TableEmitterType(value) for value in args.emit.split(",")], - dryrun=args.dryrun, - init=args.init, - skip_validations=args.skip_validations, - output_dir=args.output, - ) diff --git a/graphrag/index/run/run.py b/graphrag/index/run/run.py index 6f3e67217a..38adc396ff 100644 --- a/graphrag/index/run/run.py +++ b/graphrag/index/run/run.py @@ -8,6 +8,7 @@ import time import traceback from collections.abc import AsyncIterable +from pathlib import Path from typing import cast import pandas as pd @@ -103,7 +104,7 @@ async def run_pipeline_with_config( root_dir = config.root_dir or "" 
progress_reporter = progress_reporter or NullProgressReporter() - storage = storage or _create_storage(config.storage, root_dir=root_dir) + storage = storage or _create_storage(config.storage, root_dir=Path(root_dir)) cache = cache or _create_cache(config.cache, root_dir) callbacks = callbacks or _create_reporter(config.reporting, root_dir) dataset = ( diff --git a/graphrag/logging/types.py b/graphrag/logging/types.py index 5b2ef26d23..3ba50e5bd4 100644 --- a/graphrag/logging/types.py +++ b/graphrag/logging/types.py @@ -10,7 +10,7 @@ from datashaper import Progress -class ReporterType(Enum): +class ReporterType(str, Enum): """The type of reporter to use.""" RICH = "rich" diff --git a/graphrag/prompt_tune/__main__.py b/graphrag/prompt_tune/__main__.py deleted file mode 100644 index b55ccb4468..0000000000 --- a/graphrag/prompt_tune/__main__.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The auto templating package root.""" - -import argparse -import asyncio - -from graphrag.api import DocSelectionType -from graphrag.utils.cli import dir_exist, file_exist - -from .cli import prompt_tune -from .generator import MAX_TOKEN_COUNT -from .loader import MIN_CHUNK_SIZE - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="python -m graphrag.prompt_tune", - description="The graphrag auto templating module.", - ) - parser.add_argument( - "--config", - help="Configuration yaml file to use when generating prompts", - required=True, - type=file_exist, - ) - parser.add_argument( - "--root", - help="Data project root. Default: current directory", - default=".", - type=dir_exist, - ) - parser.add_argument( - "--domain", - help="Domain your input data is related to. For example 'space science', 'microbiology', 'environmental news'. 
If not defined, the domain will be inferred from the input data.", - type=str, - default="", - ) - parser.add_argument( - "--selection-method", - help=f"Chunk selection method. Default: {DocSelectionType.RANDOM}", - type=DocSelectionType, - choices=list(DocSelectionType), - default=DocSelectionType.RANDOM, - ) - parser.add_argument( - "--n_subset_max", - help="Number of text chunks to embed when using auto selection method. Default: 300", - type=int, - default=300, - ) - parser.add_argument( - "--k", - help="Maximum number of documents to select from each centroid when using auto selection method. Default: 15", - type=int, - default=15, - ) - parser.add_argument( - "--limit", - help="Number of documents to load when doing random or top selection. Default: 15", - type=int, - default=15, - ) - parser.add_argument( - "--max-tokens", - help=f"Max token count for prompt generation. Default: {MAX_TOKEN_COUNT}", - type=int, - default=MAX_TOKEN_COUNT, - ) - parser.add_argument( - "--min-examples-required", - help="Minimum number of examples required in the entity extraction prompt. Default: 2", - type=int, - default=2, - ) - parser.add_argument( - "--chunk-size", - help=f"Max token count for prompt generation. Default: {MIN_CHUNK_SIZE}", - type=int, - default=MIN_CHUNK_SIZE, - ) - parser.add_argument( - "--language", - help="Primary language used for inputs and outputs on GraphRAG", - type=str, - default=None, - ) - parser.add_argument( - "--no-entity-types", - help="Use untyped entity extraction generation", - action="store_true", - ) - parser.add_argument( - "--output", - help="Directory to save generated prompts to, relative to the root directory. 
Default: 'prompts'", - type=str, - default="prompts", - ) - args = parser.parse_args() - - loop = asyncio.get_event_loop() - loop.run_until_complete( - prompt_tune( - config=args.config, - root=args.root, - domain=args.domain, - selection_method=args.selection_method, - limit=args.limit, - max_tokens=args.max_tokens, - chunk_size=args.chunk_size, - language=args.language, - skip_entity_types=args.no_entity_types, - output=args.output, - n_subset_max=args.n_subset_max, - k=args.k, - min_examples_required=args.min_examples_required, - ) - ) diff --git a/graphrag/prompt_tune/types.py b/graphrag/prompt_tune/types.py index 1207d18767..f3df632452 100644 --- a/graphrag/prompt_tune/types.py +++ b/graphrag/prompt_tune/types.py @@ -6,7 +6,7 @@ from enum import Enum -class DocSelectionType(Enum): +class DocSelectionType(str, Enum): """The type of document selection to use.""" ALL = "all" diff --git a/graphrag/query/__main__.py b/graphrag/query/__main__.py deleted file mode 100644 index 8a7e113cd4..0000000000 --- a/graphrag/query/__main__.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""The Query Engine package root.""" - -import argparse -from enum import Enum - -from graphrag.utils.cli import dir_exist, file_exist - -from .cli import run_global_search, run_local_search - -INVALID_METHOD_ERROR = "Invalid method" - - -class SearchType(Enum): - """The type of search to run.""" - - LOCAL = "local" - GLOBAL = "global" - - def __str__(self): - """Return the string representation of the enum value.""" - return self.value - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="python -m graphrag.query", - description="The graphrag query engine", - ) - parser.add_argument( - "--config", - help="The configuration yaml file to use when running the query", - required=False, - type=file_exist, - ) - parser.add_argument( - "--data", - help="The path with the output data from the pipeline", - type=dir_exist, - ) - parser.add_argument( - "--root", - help="The data project root. Default value: the current directory", - default=".", - type=dir_exist, - ) - parser.add_argument( - "--method", - help="The method to run", - required=True, - type=SearchType, - choices=list(SearchType), - ) - parser.add_argument( - "--community_level", - help="Community level in the Leiden community hierarchy from which we will load the community reports. A higher value means we will use reports from smaller communities. Default: 2", - type=int, - default=2, - ) - parser.add_argument( - "--response_type", - help="Free form text describing the response type and format, can be anything, e.g. Multiple Paragraphs, Single Paragraph, Single Sentence, List of 3-7 Points, Single Page, Multi-Page Report. 
Default: Multiple Paragraphs", - type=str, - default="Multiple Paragraphs", - ) - parser.add_argument( - "--streaming", - help="Print response in a streaming manner", - action="store_true", - ) - parser.add_argument( - "query", - nargs=1, - help="The query to run", - type=str, - ) - args = parser.parse_args() - - match args.method: - case SearchType.LOCAL: - run_local_search( - args.config, - args.data, - args.root, - args.community_level, - args.response_type, - args.streaming, - args.query[0], - ) - case SearchType.GLOBAL: - run_global_search( - args.config, - args.data, - args.root, - args.community_level, - args.response_type, - args.streaming, - args.query[0], - ) - case _: - raise ValueError(INVALID_METHOD_ERROR) diff --git a/graphrag/utils/storage.py b/graphrag/utils/storage.py index 26072bda24..60d08b6309 100644 --- a/graphrag/utils/storage.py +++ b/graphrag/utils/storage.py @@ -20,7 +20,7 @@ def _create_storage( - config: PipelineStorageConfigTypes | None, root_dir: str + config: PipelineStorageConfigTypes | None, root_dir: Path ) -> PipelineStorage: """Create the storage for the pipeline. @@ -37,7 +37,7 @@ def _create_storage( The pipeline storage. """ return load_storage( - config or PipelineFileStorageConfig(base_dir=str(Path(root_dir) / "output")) + config or PipelineFileStorageConfig(base_dir=str(root_dir / "output")) ) diff --git a/poetry.lock b/poetry.lock index 7d6863122a..105b511d7e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
[[package]] name = "aiofiles" @@ -4560,6 +4560,17 @@ enabler = ["pytest-enabler (>=2.2)"] test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] type = ["importlib-metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.11.*)", "pytest-mypy"] +[[package]] +name = "shellingham" +version = "1.5.4" +description = "Tool to Detect Surrounding Shell" +optional = false +python-versions = ">=3.7" +files = [ + {file = "shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686"}, + {file = "shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de"}, +] + [[package]] name = "six" version = "1.16.0" @@ -4882,6 +4893,23 @@ files = [ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,<8.2)", "pytest-mock", "pytest-mypy-testing"] +[[package]] +name = "typer" +version = "0.12.5" +description = "Typer, build great CLIs. Easy to code. Based on Python type hints." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "typer-0.12.5-py3-none-any.whl", hash = "sha256:62fe4e471711b147e3365034133904df3e235698399bc4de2b36c8579298d52b"}, + {file = "typer-0.12.5.tar.gz", hash = "sha256:f592f089bedcc8ec1b974125d64851029c3b1af145f04aca64d69410f0c9b722"}, +] + +[package.dependencies] +click = ">=8.0.0" +rich = ">=10.11.0" +shellingham = ">=1.3.0" +typing-extensions = ">=3.7.4.3" + [[package]] name = "types-python-dateutil" version = "2.9.0.20241003" @@ -5172,4 +5200,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "d8bb5501f591421d6c155a4233fc02d5385db695d6316168144123066da3c41f" +content-hash = "0bcb3b8ebe38153edddd48f8077ddf58e4628e7b714731a9fa48785288d206b9" diff --git a/pyproject.toml b/pyproject.toml index 1056f77ae4..c290b5ae2f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "graphrag" # Maintainers: do not change the version here manually, use ./scripts/release.sh version = "0.3.6" -description = "" +description = "GraphRAG: A graph-based retrieval-augmented generation (RAG) system." 
authors = [ "Alonso Guevara Fernández ", "Andrés Morales Esquivel ", @@ -28,6 +28,9 @@ packages = [{ include = "graphrag" }] [tool.poetry.urls] "Source" = "https://github.com/microsoft/graphrag" +[tool.poetry.scripts] +graphrag = "graphrag.cli.main:app" + [tool.poetry-dynamic-versioning] enable = true style = "pep440" @@ -87,6 +90,7 @@ azure-identity = "^1.17.1" json-repair = "^0.30.0" future = "^1.0.0" # Needed until graspologic fixes their dependency +typer = "^0.12.5" [tool.poetry.group.dev.dependencies] coverage = "^7.6.0" @@ -132,9 +136,9 @@ test_integration = "pytest ./tests/integration" test_smoke = "pytest ./tests/smoke" test_notebook = "pytest ./tests/notebook" test_verbs = "pytest ./tests/verbs" -index = "python -m graphrag.index" -query = "python -m graphrag.query" -prompt_tune = "python -m graphrag.prompt_tune" +index = "python -m graphrag index" +query = "python -m graphrag query" +prompt_tune = "python -m graphrag prompt-tune" # Pass in a test pattern test_only = "pytest -s -k" diff --git a/tests/fixtures/min-csv/config.json b/tests/fixtures/min-csv/config.json index 78e601f1ab..5b135145d4 100644 --- a/tests/fixtures/min-csv/config.json +++ b/tests/fixtures/min-csv/config.json @@ -8,7 +8,7 @@ 2500 ], "subworkflows": 1, - "max_runtime": 100 + "max_runtime": 150 }, "create_base_entity_graph": { "row_range": [ @@ -34,10 +34,10 @@ "create_final_relationships": { "row_range": [ 1, - 2500 + 6000 ], "subworkflows": 1, - "max_runtime": 100 + "max_runtime": 150 }, "create_final_nodes": { "row_range": [ @@ -52,7 +52,7 @@ "level" ], "subworkflows": 1, - "max_runtime": 100 + "max_runtime": 150 }, "create_final_communities": { "row_range": [ @@ -60,7 +60,7 @@ 2500 ], "subworkflows": 1, - "max_runtime": 100 + "max_runtime": 150 }, "create_final_community_reports": { "row_range": [ @@ -90,7 +90,7 @@ "entity_ids" ], "subworkflows": 1, - "max_runtime": 100 + "max_runtime": 150 }, "create_final_documents": { "row_range": [ @@ -98,7 +98,7 @@ 2500 ], "subworkflows": 1, - 
"max_runtime": 100 + "max_runtime": 150 } }, "query_config": [ @@ -112,4 +112,4 @@ } ], "slow": false -} \ No newline at end of file +} diff --git a/tests/fixtures/text/config.json b/tests/fixtures/text/config.json index 6e8a2a3ebe..f6d945b9e1 100644 --- a/tests/fixtures/text/config.json +++ b/tests/fixtures/text/config.json @@ -8,7 +8,7 @@ 2500 ], "subworkflows": 1, - "max_runtime": 100 + "max_runtime": 150 }, "create_final_covariates": { "row_range": [ @@ -51,10 +51,10 @@ "create_final_relationships": { "row_range": [ 1, - 2500 + 6000 ], "subworkflows": 1, - "max_runtime": 100 + "max_runtime": 150 }, "create_final_nodes": { "row_range": [ @@ -69,7 +69,7 @@ "level" ], "subworkflows": 1, - "max_runtime": 100 + "max_runtime": 150 }, "create_final_communities": { "row_range": [ @@ -77,7 +77,7 @@ 2500 ], "subworkflows": 1, - "max_runtime": 100 + "max_runtime": 150 }, "create_final_community_reports": { "row_range": [ @@ -107,7 +107,7 @@ "entity_ids" ], "subworkflows": 1, - "max_runtime": 100 + "max_runtime": 150 }, "create_final_documents": { "row_range": [ @@ -115,7 +115,7 @@ 2500 ], "subworkflows": 1, - "max_runtime": 100 + "max_runtime": 150 } }, "query_config": [ @@ -129,4 +129,4 @@ } ], "slow": false -} \ No newline at end of file +} diff --git a/tests/notebook/test_notebooks.py b/tests/notebook/test_notebooks.py index 1db6100c69..9f9d9b1222 100644 --- a/tests/notebook/test_notebooks.py +++ b/tests/notebook/test_notebooks.py @@ -29,7 +29,7 @@ def _notebook_run(filepath: Path): "-y", "--no-prompt", "--stdout", - filepath.absolute().as_posix(), + str(filepath.absolute().resolve()), ] notebook = subprocess.check_output(args) nb = nbformat.reads(notebook, nbformat.current_nbformat) diff --git a/tests/smoke/test_fixtures.py b/tests/smoke/test_fixtures.py index 33b0b6a097..385f32bd22 100644 --- a/tests/smoke/test_fixtures.py +++ b/tests/smoke/test_fixtures.py @@ -136,7 +136,7 @@ def __run_indexer( "index", "--verbose" if debug else None, "--root", - 
root.absolute().as_posix(), + root.resolve().as_posix(), "--reporter", "print", ] @@ -229,11 +229,12 @@ def __run_query(self, root: Path, query_config: dict[str, str]): "poe", "query", "--root", - root.absolute().as_posix(), + root.resolve().as_posix(), "--method", query_config["method"], - "--community_level", + "--community-level", str(query_config.get("community_level", 2)), + "--query", query_config["query"], ]