Commit: use default-config settings in init-content

darthtrevino committed Apr 5, 2024
1 parent 006ad3c · commit a499bc8
Showing 2 changed files with 95 additions and 49 deletions.
3 changes: 3 additions & 0 deletions graphrag/index/default_config/parameters/defaults.py
@@ -2,6 +2,8 @@
 """Common default configuration values."""
 
+from datashaper import AsyncType
+
 from graphrag.index.config import (
     PipelineCacheType,
     PipelineInputStorageType,
@@ -12,6 +14,7 @@
 from graphrag.index.default_config.parameters.models import TextEmbeddingTarget
 from graphrag.index.llm.types import LLMType
 
+DEFAULT_ASYNC_MODE = AsyncType.Threaded
 #
 # LLM Parameters
 #
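Note: the new constant is an enum member, which is why the template below interpolates `DEFAULT_ASYNC_MODE.value` rather than the member itself. A minimal sketch of the distinction, assuming datashaper's `AsyncType` is a plain `Enum` whose string values match the diff (the class definition itself is not part of this commit):

```python
from enum import Enum

# Hypothetical stand-in for datashaper.AsyncType; only the "threaded" and
# "asyncio" values are confirmed by the template diff below.
class AsyncType(Enum):
    AsyncIO = "asyncio"
    Threaded = "threaded"

DEFAULT_ASYNC_MODE = AsyncType.Threaded

# Interpolating the member itself would leak the enum repr into the YAML:
print(f"async_mode: {DEFAULT_ASYNC_MODE}")        # async_mode: AsyncType.Threaded
# .value yields the clean string the config file expects:
print(f"async_mode: {DEFAULT_ASYNC_MODE.value}")  # async_mode: threaded
```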
141 changes: 92 additions & 49 deletions graphrag/index/init_content.py
@@ -1,84 +1,127 @@
 # Copyright (c) 2024 Microsoft Corporation. All rights reserved.
 """Content for the init CLI command."""
 
-INIT_YAML = """
+from graphrag.index.default_config.parameters.defaults import (
+    DEFAULT_ASYNC_MODE,
+    DEFAULT_CACHE_BASE_DIR,
+    DEFAULT_CACHE_TYPE,
+    DEFAULT_CHUNK_GROUP_BY_COLUMNS,
+    DEFAULT_CHUNK_OVERLAP,
+    DEFAULT_CHUNK_SIZE,
+    DEFAULT_CLAIM_DESCRIPTION,
+    DEFAULT_CLAIM_MAX_GLEANINGS,
+    DEFAULT_COMMUNITY_REPORT_MAX_INPUT_LENGTH,
+    DEFAULT_COMMUNITY_REPORT_MAX_LENGTH,
+    DEFAULT_EMBEDDING_BATCH_MAX_TOKENS,
+    DEFAULT_EMBEDDING_BATCH_SIZE,
+    DEFAULT_EMBEDDING_CONCURRENT_REQUESTS,
+    DEFAULT_EMBEDDING_MAX_RETRIES,
+    DEFAULT_EMBEDDING_MAX_RETRY_WAIT,
+    DEFAULT_EMBEDDING_MODEL,
+    DEFAULT_EMBEDDING_TARGET,
+    DEFAULT_EMBEDDING_TYPE,
+    DEFAULT_ENTITY_EXTRACTION_ENTITY_TYPES,
+    DEFAULT_INPUT_BASE_DIR,
+    DEFAULT_INPUT_FILE_ENCODING,
+    DEFAULT_INPUT_TYPE,
+    DEFAULT_LLM_CONCURRENT_REQUESTS,
+    DEFAULT_LLM_MAX_RETRIES,
+    DEFAULT_LLM_MAX_RETRY_WAIT,
+    DEFAULT_LLM_MAX_TOKENS,
+    DEFAULT_LLM_MODEL,
+    DEFAULT_LLM_REQUEST_TIMEOUT,
+    DEFAULT_LLM_TYPE,
+    DEFAULT_MAX_CLUSTER_SIZE,
+    DEFAULT_NODE2VEC_ITERATIONS,
+    DEFAULT_NODE2VEC_NUM_WALKS,
+    DEFAULT_NODE2VEC_RANDOM_SEED,
+    DEFAULT_NODE2VEC_WALK_LENGTH,
+    DEFAULT_NODE2VEC_WINDOW_SIZE,
+    DEFAULT_PARALLELIZATION_NUM_THREADS,
+    DEFAULT_PARALLELIZATION_STAGGER,
+    DEFAULT_REPORTING_BASE_DIR,
+    DEFAULT_REPORTING_TYPE,
+    DEFAULT_STORAGE_BASE_DIR,
+    DEFAULT_STORAGE_TYPE,
+    DEFAULT_SUMMARIZE_DESCRIPTIONS_MAX_LENGTH,
+)
+
+INIT_YAML = f"""
 encoding_model: cl100k_base
 skip_workflows: []
 llm:
-  api_key: ${GRAPHRAG_API_KEY}
-  type: openai_chat # or azure_openai_chat
-  model: gpt-4-turbo-preview
+  api_key: ${{GRAPHRAG_API_KEY}}
+  type: {DEFAULT_LLM_TYPE.value} # or azure_openai_chat
+  model: {DEFAULT_LLM_MODEL}
   model_supports_json: true # recommended if this is available for your model.
-  # max_tokens: 4000
-  # request_timeout: 180.0
+  # max_tokens: {DEFAULT_LLM_MAX_TOKENS}
+  # request_timeout: {DEFAULT_LLM_REQUEST_TIMEOUT}
   # api_base: https://<instance>.openai.azure.com
   # api_version: 2024-02-15-preview
   # organization: <organization_id>
   # deployment_name: <azure_model_deployment_name>
   # tokens_per_minute: 150_000 # set a leaky bucket throttle
   # requests_per_minute: 10_000 # set a leaky bucket throttle
-  # max_retries: 10
-  # max_retry_wait: 10.0
+  # max_retries: {DEFAULT_LLM_MAX_RETRIES}
+  # max_retry_wait: {DEFAULT_LLM_MAX_RETRY_WAIT}
   # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
-  # concurrent_requests: 25 # the number of parallel inflight requests that may be made
+  # concurrent_requests: {DEFAULT_LLM_CONCURRENT_REQUESTS} # the number of parallel inflight requests that may be made
 parallelization:
-  stagger: 0.3
-  # num_threads: 50 # the number of threads to use for parallel processing
+  stagger: {DEFAULT_PARALLELIZATION_STAGGER}
+  # num_threads: {DEFAULT_PARALLELIZATION_NUM_THREADS} # the number of threads to use for parallel processing
-async_mode: threaded # or asyncio
+async_mode: {DEFAULT_ASYNC_MODE.value} # or asyncio
 embeddings:
   ## parallelization: override the global parallelization settings for embeddings
+  async_mode: {DEFAULT_ASYNC_MODE.value} # or asyncio
   llm:
-    api_key: ${GRAPHRAG_API_KEY}
-    type: openai_embedding # or azure_openai_embedding
-    model: text-embedding-3-small
+    api_key: ${{GRAPHRAG_API_KEY}}
+    type: {DEFAULT_EMBEDDING_TYPE.value} # or azure_openai_embedding
+    model: {DEFAULT_EMBEDDING_MODEL}
     # api_base: https://<instance>.openai.azure.com
     # api_version: 2024-02-15-preview
     # organization: <organization_id>
     # deployment_name: <azure_model_deployment_name>
     # tokens_per_minute: 150_000 # set a leaky bucket throttle
     # requests_per_minute: 10_000 # set a leaky bucket throttle
-    # max_retries: 10
-    # max_retry_wait: 10.0
+    # max_retries: {DEFAULT_EMBEDDING_MAX_RETRIES}
+    # max_retry_wait: {DEFAULT_EMBEDDING_MAX_RETRY_WAIT}
     # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
-    # concurrent_requests: 25 # the number of parallel inflight requests that may be made
-    # batch_size: 16 # the number of documents to send in a single request
-    # batch_max_tokens: 8191 # the maximum number of tokens to send in a single request
-    # target: required # or optional
+    # concurrent_requests: {DEFAULT_EMBEDDING_CONCURRENT_REQUESTS} # the number of parallel inflight requests that may be made
+    # batch_size: {DEFAULT_EMBEDDING_BATCH_SIZE} # the number of documents to send in a single request
+    # batch_max_tokens: {DEFAULT_EMBEDDING_BATCH_MAX_TOKENS} # the maximum number of tokens to send in a single request
+    # target: {DEFAULT_EMBEDDING_TARGET.value} # or optional
-  parallelization:
-    stagger: 0.3
-    # num_threads: 50 # the number of threads to use for parallel processing
-  async_mode: threaded # or asyncio
 chunks:
-  size: 300
-  overlap: 100
-  group_by_columns: ["id"] # by default, we don't allow chunks to cross documents
+  size: {DEFAULT_CHUNK_SIZE}
+  overlap: {DEFAULT_CHUNK_OVERLAP}
+  group_by_columns: [{",".join(DEFAULT_CHUNK_GROUP_BY_COLUMNS)}] # by default, we don't allow chunks to cross documents
 input:
-  type: csv
-  base_dir: input
-  file_encoding: utf-8
+  type: {DEFAULT_INPUT_TYPE}
+  base_dir: "{DEFAULT_INPUT_BASE_DIR}"
+  file_encoding: {DEFAULT_INPUT_FILE_ENCODING}
   file_pattern: ".*\\\\.csv$"
 cache:
-  type: file # or blob
-  base_dir: cache
+  type: {DEFAULT_CACHE_TYPE.value} # or blob
+  base_dir: "{DEFAULT_CACHE_BASE_DIR}"
   # connection_string: <azure_blob_storage_connection_string>
   # container_name: <azure_blob_storage_container_name>
 storage:
-  type: file # or blob
-  base_dir: "output/${timestamp}/artifacts"
+  type: {DEFAULT_STORAGE_TYPE.value} # or blob
+  base_dir: "{DEFAULT_STORAGE_BASE_DIR}"
   # connection_string: <azure_blob_storage_connection_string>
   # container_name: <azure_blob_storage_container_name>
 reporting:
-  type: file # or console, blob
-  base_dir: "output/${timestamp}/reports"
+  type: {DEFAULT_REPORTING_TYPE} # or console, blob
+  base_dir: "{DEFAULT_REPORTING_BASE_DIR}"
   # connection_string: <azure_blob_storage_connection_string>
   # container_name: <azure_blob_storage_container_name>
@@ -87,42 +130,42 @@
   ## parallelization: override the global parallelization settings for this task
   ## async_mode: override the global async_mode settings for this task
   prompt: "prompts/entity_extraction.txt"
-  entity_types: ["organization", "person", "geo", "event"]
+  entity_types: [{",".join(DEFAULT_ENTITY_EXTRACTION_ENTITY_TYPES)}]
   max_gleanings: 0
 summarize_descriptions:
   ## llm: override the global llm settings for this task
   ## parallelization: override the global parallelization settings for this task
   ## async_mode: override the global async_mode settings for this task
   prompt: "prompts/summarize_descriptions.txt"
-  max_length: 500
+  max_length: {DEFAULT_SUMMARIZE_DESCRIPTIONS_MAX_LENGTH}
 claim_extraction:
   ## llm: override the global llm settings for this task
   ## parallelization: override the global parallelization settings for this task
   ## async_mode: override the global async_mode settings for this task
   prompt: "prompts/claim_extraction.txt"
-  description: "Any claims or facts that could be relevant to information discovery."
-  max_gleanings: 0
+  description: "{DEFAULT_CLAIM_DESCRIPTION}"
+  max_gleanings: {DEFAULT_CLAIM_MAX_GLEANINGS}
 community_report:
   ## llm: override the global llm settings for this task
   ## parallelization: override the global parallelization settings for this task
   ## async_mode: override the global async_mode settings for this task
   prompt: "prompts/community_report.txt"
-  max_length: 1500
-  max_input_length: 12_000
+  max_length: {DEFAULT_COMMUNITY_REPORT_MAX_LENGTH}
+  max_input_length: {DEFAULT_COMMUNITY_REPORT_MAX_INPUT_LENGTH}
 cluster_graph:
-  max_cluster_size: 10
+  max_cluster_size: {DEFAULT_MAX_CLUSTER_SIZE}
 embed_graph:
   is_enabled: false # if true, will generate node2vec embeddings for nodes
-  # num_walks: 10
-  # walk_length: 40
-  # window_size: 2
-  # iterations: 3
-  # random_seed: 597832
+  # num_walks: {DEFAULT_NODE2VEC_NUM_WALKS}
+  # walk_length: {DEFAULT_NODE2VEC_WALK_LENGTH}
+  # window_size: {DEFAULT_NODE2VEC_WINDOW_SIZE}
+  # iterations: {DEFAULT_NODE2VEC_ITERATIONS}
+  # random_seed: {DEFAULT_NODE2VEC_RANDOM_SEED}
 umap:
   is_enabled: false # if true, will generate UMAP embeddings for nodes
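The heart of this change is switching INIT_YAML from a plain triple-quoted string to an f-string, which forces one subtlety visible throughout the diff: literal `${...}` environment-variable references must now be written with doubled braces so Python does not treat them as interpolation fields. A minimal sketch of the pattern, using stand-in constants in place of the real ones from graphrag.index.default_config.parameters.defaults:

```python
# Stand-ins for illustration only; the real constants are imported from
# graphrag.index.default_config.parameters.defaults.
DEFAULT_LLM_TYPE_VALUE = "openai_chat"  # i.e., DEFAULT_LLM_TYPE.value
DEFAULT_LLM_MODEL = "gpt-4-turbo-preview"

snippet = f"""llm:
  api_key: ${{GRAPHRAG_API_KEY}}
  type: {DEFAULT_LLM_TYPE_VALUE} # or azure_openai_chat
  model: {DEFAULT_LLM_MODEL}
"""
print(snippet)
# llm:
#   api_key: ${GRAPHRAG_API_KEY}
#   type: openai_chat # or azure_openai_chat
#   model: gpt-4-turbo-preview
```

The same rule explains why `file_pattern: ".*\\\\.csv$"` is untouched: backslash escapes behave identically in plain strings and f-strings, so only brace characters needed rewriting.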

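One behavioral nuance worth noting: the old template hard-coded quoted YAML flow lists (e.g. `entity_types: ["organization", "person", "geo", "event"]`), while the new one renders Python lists with `",".join(...)`, which drops the quotes and the spaces after commas. Assuming the defaults are plain lists of strings mirroring the old literals (hypothetical values below), the rendered YAML is equivalent but not byte-identical:

```python
# Hypothetical default values, mirroring the old hard-coded template text.
DEFAULT_ENTITY_EXTRACTION_ENTITY_TYPES = ["organization", "person", "geo", "event"]
DEFAULT_CHUNK_GROUP_BY_COLUMNS = ["id"]

print(f'entity_types: [{",".join(DEFAULT_ENTITY_EXTRACTION_ENTITY_TYPES)}]')
# entity_types: [organization,person,geo,event]

print(f'group_by_columns: [{",".join(DEFAULT_CHUNK_GROUP_BY_COLUMNS)}]')
# group_by_columns: [id]
```

YAML does not require quoting for simple identifiers in flow sequences, so both forms parse to the same list of strings.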