Commit: use default-config settings in init-content

darthtrevino committed Apr 5, 2024
1 parent 006ad3c · commit a499bc8
Showing 2 changed files with 95 additions and 49 deletions.
3 changes: 3 additions & 0 deletions graphrag/index/default_config/parameters/defaults.py
@@ -2,6 +2,8 @@
 """Common default configuration values."""
 
+from datashaper import AsyncType
+
 from graphrag.index.config import (
     PipelineCacheType,
     PipelineInputStorageType,
@@ -12,6 +14,7 @@
 from graphrag.index.default_config.parameters.models import TextEmbeddingTarget
 from graphrag.index.llm.types import LLMType
 
+DEFAULT_ASYNC_MODE = AsyncType.Threaded
 #
 # LLM Parameters
 #
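Note: the new constant is an enum member, which is why the template below interpolates `DEFAULT_ASYNC_MODE.value` rather than the member itself. A minimal sketch of the distinction, assuming datashaper's `AsyncType` is a plain `Enum` whose string values match the diff (the class definition itself is not part of this commit):

```python
from enum import Enum

# Hypothetical stand-in for datashaper.AsyncType; only the "threaded" and
# "asyncio" values are confirmed by the template diff below.
class AsyncType(Enum):
    AsyncIO = "asyncio"
    Threaded = "threaded"

DEFAULT_ASYNC_MODE = AsyncType.Threaded

# Interpolating the member itself would leak the enum repr into the YAML:
print(f"async_mode: {DEFAULT_ASYNC_MODE}")        # async_mode: AsyncType.Threaded
# .value yields the clean string the config file expects:
print(f"async_mode: {DEFAULT_ASYNC_MODE.value}")  # async_mode: threaded
```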
141 changes: 92 additions & 49 deletions graphrag/index/init_content.py
@@ -1,84 +1,127 @@
 # Copyright (c) 2024 Microsoft Corporation. All rights reserved.
 """Content for the init CLI command."""
 
-INIT_YAML = """
+from graphrag.index.default_config.parameters.defaults import (
+    DEFAULT_ASYNC_MODE,
+    DEFAULT_CACHE_BASE_DIR,
+    DEFAULT_CACHE_TYPE,
+    DEFAULT_CHUNK_GROUP_BY_COLUMNS,
+    DEFAULT_CHUNK_OVERLAP,
+    DEFAULT_CHUNK_SIZE,
+    DEFAULT_CLAIM_DESCRIPTION,
+    DEFAULT_CLAIM_MAX_GLEANINGS,
+    DEFAULT_COMMUNITY_REPORT_MAX_INPUT_LENGTH,
+    DEFAULT_COMMUNITY_REPORT_MAX_LENGTH,
+    DEFAULT_EMBEDDING_BATCH_MAX_TOKENS,
+    DEFAULT_EMBEDDING_BATCH_SIZE,
+    DEFAULT_EMBEDDING_CONCURRENT_REQUESTS,
+    DEFAULT_EMBEDDING_MAX_RETRIES,
+    DEFAULT_EMBEDDING_MAX_RETRY_WAIT,
+    DEFAULT_EMBEDDING_MODEL,
+    DEFAULT_EMBEDDING_TARGET,
+    DEFAULT_EMBEDDING_TYPE,
+    DEFAULT_ENTITY_EXTRACTION_ENTITY_TYPES,
+    DEFAULT_INPUT_BASE_DIR,
+    DEFAULT_INPUT_FILE_ENCODING,
+    DEFAULT_INPUT_TYPE,
+    DEFAULT_LLM_CONCURRENT_REQUESTS,
+    DEFAULT_LLM_MAX_RETRIES,
+    DEFAULT_LLM_MAX_RETRY_WAIT,
+    DEFAULT_LLM_MAX_TOKENS,
+    DEFAULT_LLM_MODEL,
+    DEFAULT_LLM_REQUEST_TIMEOUT,
+    DEFAULT_LLM_TYPE,
+    DEFAULT_MAX_CLUSTER_SIZE,
+    DEFAULT_NODE2VEC_ITERATIONS,
+    DEFAULT_NODE2VEC_NUM_WALKS,
+    DEFAULT_NODE2VEC_RANDOM_SEED,
+    DEFAULT_NODE2VEC_WALK_LENGTH,
+    DEFAULT_NODE2VEC_WINDOW_SIZE,
+    DEFAULT_PARALLELIZATION_NUM_THREADS,
+    DEFAULT_PARALLELIZATION_STAGGER,
+    DEFAULT_REPORTING_BASE_DIR,
+    DEFAULT_REPORTING_TYPE,
+    DEFAULT_STORAGE_BASE_DIR,
+    DEFAULT_STORAGE_TYPE,
+    DEFAULT_SUMMARIZE_DESCRIPTIONS_MAX_LENGTH,
+)
+
+INIT_YAML = f"""
 encoding_model: cl100k_base
 skip_workflows: []
 llm:
-  api_key: ${GRAPHRAG_API_KEY}
-  type: openai_chat # or azure_openai_chat
-  model: gpt-4-turbo-preview
+  api_key: ${{GRAPHRAG_API_KEY}}
+  type: {DEFAULT_LLM_TYPE.value} # or azure_openai_chat
+  model: {DEFAULT_LLM_MODEL}
   model_supports_json: true # recommended if this is available for your model.
-  # max_tokens: 4000
-  # request_timeout: 180.0
+  # max_tokens: {DEFAULT_LLM_MAX_TOKENS}
+  # request_timeout: {DEFAULT_LLM_REQUEST_TIMEOUT}
   # api_base: https://<instance>.openai.azure.com
   # api_version: 2024-02-15-preview
   # organization: <organization_id>
   # deployment_name: <azure_model_deployment_name>
   # tokens_per_minute: 150_000 # set a leaky bucket throttle
   # requests_per_minute: 10_000 # set a leaky bucket throttle
-  # max_retries: 10
-  # max_retry_wait: 10.0
+  # max_retries: {DEFAULT_LLM_MAX_RETRIES}
+  # max_retry_wait: {DEFAULT_LLM_MAX_RETRY_WAIT}
   # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
-  # concurrent_requests: 25 # the number of parallel inflight requests that may be made
+  # concurrent_requests: {DEFAULT_LLM_CONCURRENT_REQUESTS} # the number of parallel inflight requests that may be made
 parallelization:
-  stagger: 0.3
-  # num_threads: 50 # the number of threads to use for parallel processing
+  stagger: {DEFAULT_PARALLELIZATION_STAGGER}
+  # num_threads: {DEFAULT_PARALLELIZATION_NUM_THREADS} # the number of threads to use for parallel processing
-async_mode: threaded # or asyncio
+async_mode: {DEFAULT_ASYNC_MODE.value} # or asyncio
 embeddings:
   ## parallelization: override the global parallelization settings for embeddings
+  async_mode: {DEFAULT_ASYNC_MODE.value} # or asyncio
   llm:
-    api_key: ${GRAPHRAG_API_KEY}
-    type: openai_embedding # or azure_openai_embedding
-    model: text-embedding-3-small
+    api_key: ${{GRAPHRAG_API_KEY}}
+    type: {DEFAULT_EMBEDDING_TYPE.value} # or azure_openai_embedding
+    model: {DEFAULT_EMBEDDING_MODEL}
     # api_base: https://<instance>.openai.azure.com
     # api_version: 2024-02-15-preview
     # organization: <organization_id>
     # deployment_name: <azure_model_deployment_name>
     # tokens_per_minute: 150_000 # set a leaky bucket throttle
     # requests_per_minute: 10_000 # set a leaky bucket throttle
-    # max_retries: 10
-    # max_retry_wait: 10.0
+    # max_retries: {DEFAULT_EMBEDDING_MAX_RETRIES}
+    # max_retry_wait: {DEFAULT_EMBEDDING_MAX_RETRY_WAIT}
     # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
-    # concurrent_requests: 25 # the number of parallel inflight requests that may be made
-    # batch_size: 16 # the number of documents to send in a single request
-    # batch_max_tokens: 8191 # the maximum number of tokens to send in a single request
-    # target: required # or optional
+    # concurrent_requests: {DEFAULT_EMBEDDING_CONCURRENT_REQUESTS} # the number of parallel inflight requests that may be made
+    # batch_size: {DEFAULT_EMBEDDING_BATCH_SIZE} # the number of documents to send in a single request
+    # batch_max_tokens: {DEFAULT_EMBEDDING_BATCH_MAX_TOKENS} # the maximum number of tokens to send in a single request
+    # target: {DEFAULT_EMBEDDING_TARGET.value} # or optional
-  parallelization:
-    stagger: 0.3
-    # num_threads: 50 # the number of threads to use for parallel processing
-  async_mode: threaded # or asyncio
 chunks:
-  size: 300
-  overlap: 100
-  group_by_columns: ["id"] # by default, we don't allow chunks to cross documents
+  size: {DEFAULT_CHUNK_SIZE}
+  overlap: {DEFAULT_CHUNK_OVERLAP}
+  group_by_columns: [{",".join(DEFAULT_CHUNK_GROUP_BY_COLUMNS)}] # by default, we don't allow chunks to cross documents
 input:
-  type: csv
-  base_dir: input
-  file_encoding: utf-8
+  type: {DEFAULT_INPUT_TYPE}
+  base_dir: "{DEFAULT_INPUT_BASE_DIR}"
+  file_encoding: {DEFAULT_INPUT_FILE_ENCODING}
   file_pattern: ".*\\\\.csv$"
 cache:
-  type: file # or blob
-  base_dir: cache
+  type: {DEFAULT_CACHE_TYPE.value} # or blob
+  base_dir: "{DEFAULT_CACHE_BASE_DIR}"
   # connection_string: <azure_blob_storage_connection_string>
   # container_name: <azure_blob_storage_container_name>
 storage:
-  type: file # or blob
-  base_dir: "output/${timestamp}/artifacts"
+  type: {DEFAULT_STORAGE_TYPE.value} # or blob
+  base_dir: "{DEFAULT_STORAGE_BASE_DIR}"
   # connection_string: <azure_blob_storage_connection_string>
   # container_name: <azure_blob_storage_container_name>
 reporting:
-  type: file # or console, blob
-  base_dir: "output/${timestamp}/reports"
+  type: {DEFAULT_REPORTING_TYPE} # or console, blob
+  base_dir: "{DEFAULT_REPORTING_BASE_DIR}"
   # connection_string: <azure_blob_storage_connection_string>
   # container_name: <azure_blob_storage_container_name>
@@ -87,42 +130,42 @@
   ## parallelization: override the global parallelization settings for this task
   ## async_mode: override the global async_mode settings for this task
   prompt: "prompts/entity_extraction.txt"
-  entity_types: ["organization", "person", "geo", "event"]
+  entity_types: [{",".join(DEFAULT_ENTITY_EXTRACTION_ENTITY_TYPES)}]
   max_gleanings: 0
 summarize_descriptions:
   ## llm: override the global llm settings for this task
   ## parallelization: override the global parallelization settings for this task
   ## async_mode: override the global async_mode settings for this task
   prompt: "prompts/summarize_descriptions.txt"
-  max_length: 500
+  max_length: {DEFAULT_SUMMARIZE_DESCRIPTIONS_MAX_LENGTH}
 claim_extraction:
   ## llm: override the global llm settings for this task
   ## parallelization: override the global parallelization settings for this task
   ## async_mode: override the global async_mode settings for this task
   prompt: "prompts/claim_extraction.txt"
-  description: "Any claims or facts that could be relevant to information discovery."
-  max_gleanings: 0
+  description: "{DEFAULT_CLAIM_DESCRIPTION}"
+  max_gleanings: {DEFAULT_CLAIM_MAX_GLEANINGS}
 community_report:
   ## llm: override the global llm settings for this task
   ## parallelization: override the global parallelization settings for this task
   ## async_mode: override the global async_mode settings for this task
   prompt: "prompts/community_report.txt"
-  max_length: 1500
-  max_input_length: 12_000
+  max_length: {DEFAULT_COMMUNITY_REPORT_MAX_LENGTH}
+  max_input_length: {DEFAULT_COMMUNITY_REPORT_MAX_INPUT_LENGTH}
 cluster_graph:
-  max_cluster_size: 10
+  max_cluster_size: {DEFAULT_MAX_CLUSTER_SIZE}
 embed_graph:
   is_enabled: false # if true, will generate node2vec embeddings for nodes
-  # num_walks: 10
-  # walk_length: 40
-  # window_size: 2
-  # iterations: 3
-  # random_seed: 597832
+  # num_walks: {DEFAULT_NODE2VEC_NUM_WALKS}
+  # walk_length: {DEFAULT_NODE2VEC_WALK_LENGTH}
+  # window_size: {DEFAULT_NODE2VEC_WINDOW_SIZE}
+  # iterations: {DEFAULT_NODE2VEC_ITERATIONS}
+  # random_seed: {DEFAULT_NODE2VEC_RANDOM_SEED}
 umap:
   is_enabled: false # if true, will generate UMAP embeddings for nodes
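The heart of this change is switching INIT_YAML from a plain triple-quoted string to an f-string, which forces one subtlety visible throughout the diff: literal `${...}` environment-variable references must now be written with doubled braces so Python does not treat them as interpolation fields. A minimal sketch of the pattern, using stand-in constants in place of the real ones from graphrag.index.default_config.parameters.defaults:

```python
# Stand-ins for illustration only; the real constants are imported from
# graphrag.index.default_config.parameters.defaults.
DEFAULT_LLM_TYPE_VALUE = "openai_chat"  # i.e., DEFAULT_LLM_TYPE.value
DEFAULT_LLM_MODEL = "gpt-4-turbo-preview"

snippet = f"""llm:
  api_key: ${{GRAPHRAG_API_KEY}}
  type: {DEFAULT_LLM_TYPE_VALUE} # or azure_openai_chat
  model: {DEFAULT_LLM_MODEL}
"""
print(snippet)
# llm:
#   api_key: ${GRAPHRAG_API_KEY}
#   type: openai_chat # or azure_openai_chat
#   model: gpt-4-turbo-preview
```

The same rule explains why `file_pattern: ".*\\\\.csv$"` is untouched: backslash escapes behave identically in plain strings and f-strings, so only brace characters needed rewriting.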

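One behavioral nuance worth noting: the old template hard-coded quoted YAML flow lists (e.g. `entity_types: ["organization", "person", "geo", "event"]`), while the new one renders Python lists with `",".join(...)`, which drops the quotes and the spaces after commas. Assuming the defaults are plain lists of strings mirroring the old literals (hypothetical values below), the rendered YAML is equivalent but not byte-identical:

```python
# Hypothetical default values, mirroring the old hard-coded template text.
DEFAULT_ENTITY_EXTRACTION_ENTITY_TYPES = ["organization", "person", "geo", "event"]
DEFAULT_CHUNK_GROUP_BY_COLUMNS = ["id"]

print(f'entity_types: [{",".join(DEFAULT_ENTITY_EXTRACTION_ENTITY_TYPES)}]')
# entity_types: [organization,person,geo,event]

print(f'group_by_columns: [{",".join(DEFAULT_CHUNK_GROUP_BY_COLUMNS)}]')
# group_by_columns: [id]
```

YAML does not require quoting for simple identifiers in flow sequences, so both forms parse to the same list of strings.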