[Bug]: Extra LLM parameters are ignored (temperature, n, top_p) caused vLLM to crash! #550

thusinh1969 · 2024-07-14T11:08:33Z

Describe the bug

GraphRAG's configuration parsing completely misses the LLM control parameters, such as temperature, n, and top_p. Although these are set in the settings.yaml file (shown below), verbose mode shows that they are somehow dropped during parsing.

Steps to reproduce

Verbose mode shows that the 3 parameters that should be included (temperature, n, top_p) are completely missing:

🚀 Reading settings from ragtest/settings.yaml
Using default configuration: {
"llm": {
"api_key": "REDACTED, length 3",
"type": "openai_chat",
"model": "Gemma2_9b_it",
"max_tokens": 1500,
"request_timeout": 180.0,
"api_base": "http://localhost:8900/v1",
"api_version": "gemma2-9b-it",
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": "gemma2",
"model_supports_json": true,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 1,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 1
},
"parallelization": {
"stagger": 0.3,
"num_threads": 1
},
"async_mode": "threaded",
"root_dir": "./ragtest",
"reporting": {
"type": "file",
"base_dir": "output/${timestamp}/reports",
"storage_account_blob_url": null
},
"storage": {
"type": "file",
"base_dir": "output/${timestamp}/artifacts",
"storage_account_blob_url": null
},
"cache": {
"type": "file",
"base_dir": "cache",
"storage_account_blob_url": null
},
"input": {
"type": "file",
"file_type": "text",
"base_dir": "input",
"storage_account_blob_url": null,
"encoding": "utf-8",
"file_pattern": ".*\.txt$",
"file_filter": null,
"source_column": null,
"timestamp_column": null,
"timestamp_format": null,
"text_column": "text",
"title_column": null,
"document_attribute_columns": []
},
"embed_graph": {
"enabled": false,
"num_walks": 10,
"walk_length": 40,
"window_size": 2,
"iterations": 3,
"random_seed": 597832,
"strategy": null
},
"embeddings": {
"llm": {
"api_key": "REDACTED, length 3",
"type": "openai_embedding",
"model": "intfloat/multilingual-e5-large",
"max_tokens": 4000,
"request_timeout": 180.0,
"api_base": "http://localhost:8000/v1",
"api_version": "gemma2-9b-it",
"organization": "REDACTED, length 4",
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": "gemma2",
"model_supports_json": null,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 10,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 1
},
"parallelization": {
"stagger": 0.3,
"num_threads": 1
},
"async_mode": "threaded",
"batch_size": 16,
"batch_max_tokens": 8191,
"target": "required",
"skip": [],
"vector_store": null,
"strategy": null
},
"chunks": {
"size": 512,
"overlap": 100,
"group_by_columns": [
"id"
],
"strategy": null
},
"snapshots": {
"graphml": false,
"raw_entities": false,
"top_level_nodes": false
},
"entity_extraction": {
"llm": {
"api_key": "REDACTED, length 3",
"type": "openai_chat",
"model": "Gemma2_9b_it",
"max_tokens": 1500,
"request_timeout": 180.0,
"api_base": "http://localhost:8900/v1",
"api_version": "gemma2-9b-it",
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": "gemma2",
"model_supports_json": true,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 1,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 1
},
"parallelization": {
"stagger": 0.3,
"num_threads": 1
},
"async_mode": "threaded",
"prompt": "prompts/entity_extraction.txt",
"entity_types": [
"person",
"partnership",
"owner",
"friend",
"role",
"technology",
"equipment",
"organization",
"tax code",
"event",
"location",
"date",
"factory",
"farm",
"tower",
"resort",
"hotel",
"real estate",
"concept",
"decision",
"article",
"creditor",
"debtor",
"stock owner",
"bond owner",
"fund raiser",
"issuance",
"guarantor",
"investigator",
"convicted",
"arrested"
],
"max_gleanings": 0,
"strategy": null
},
"summarize_descriptions": {
"llm": {
"api_key": "REDACTED, length 3",
"type": "openai_chat",
"model": "Gemma2_9b_it",
"max_tokens": 1500,
"request_timeout": 180.0,
"api_base": "http://localhost:8900/v1",
"api_version": "gemma2-9b-it",
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": "gemma2",
"model_supports_json": true,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 1,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 1
},
"parallelization": {
"stagger": 0.3,
"num_threads": 1
},
"async_mode": "threaded",
"prompt": "prompts/summarize_descriptions.txt",
"max_length": 500,
"strategy": null
},
"community_reports": {
"llm": {
"api_key": "REDACTED, length 3",
"type": "openai_chat",
"model": "Gemma2_9b_it",
"max_tokens": 1500,
"request_timeout": 180.0,
"api_base": "http://localhost:8900/v1",
"api_version": "gemma2-9b-it",
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": "gemma2",
"model_supports_json": true,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 1,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 1
},
"parallelization": {
"stagger": 0.3,
"num_threads": 1
},
"async_mode": "threaded",
"prompt": null,
"max_length": 2000,
"max_input_length": 8000,
"strategy": null
},
"claim_extraction": {
"llm": {
"api_key": "REDACTED, length 3",
"type": "openai_chat",
"model": "Gemma2_9b_it",
"max_tokens": 1500,
"request_timeout": 180.0,
"api_base": "http://localhost:8900/v1",
"api_version": "gemma2-9b-it",
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": "gemma2",
"model_supports_json": true,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 1,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 1
},
"parallelization": {
"stagger": 0.3,
"num_threads": 1
},
"async_mode": "threaded",
"enabled": false,
"prompt": "prompts/claim_extraction.txt",
"description": "c\u00e1c kh\u1eb3ng \u0111\u1ecbnh c\u00f3 minh ch\u1ee9ng li\u00ean quan \u0111\u1ebfn c\u00e1c th\u1ef1c th\u1ec3 (entities), t\u1ed5 ch\u1ee9c (orgnizations) hay c\u00e1c quan h\u1ec7 (relationships).",
"max_gleanings": 0,
"strategy": null
},
"cluster_graph": {
"max_cluster_size": 10,
"strategy": null
},
"umap": {
"enabled": false
},
"local_search": {
"text_unit_prop": 0.5,
"community_prop": 0.1,
"conversation_history_max_turns": 5,
"top_k_entities": 10,
"top_k_relationships": 10,
"max_tokens": 12000,
"llm_max_tokens": 2000
},
"global_search": {
"max_tokens": 12000,
"data_max_tokens": 12000,
"map_max_tokens": 1000,
"reduce_max_tokens": 2000,
"concurrency": 32
},
"encoding_model": "cl100k_base",
"skip_workflows": []
}

Expected Behavior

GraphRAG should correctly use all parameters that are described in the docs here:
https://microsoft.github.io/graphrag/posts/config/json_yaml/

GraphRAG Config Used

encoding_model: cl100k_base
skip_workflows: []
llm:
api_key: ABC
type: openai_chat # or azure_openai_chat
model: Gemma2_9b_it
model_supports_json: true # recommended if this is available for your model.
max_tokens: 1500
request_timeout: 180.0
api_base: http://localhost:8900/v1
api_version: gemma2-9b-it
organization: EraX
deployment_name: gemma2
max_retries: 1
concurrent_requests: 1 # the number of parallel inflight requests that may be made
temperature: 1.0
top_p: 0.95
n: 1
max_retry_wait: 10.0
sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
tokens_per_minute: 150_000 # set a leaky bucket throttle
requests_per_minute: 10_000 # set a leaky bucket throttle

parallelization:
stagger: 0.3
num_threads: 1 # the number of threads to use for parallel processing

async_mode: threaded # or asyncio

embeddings:

# parallelization: override the global parallelization settings for embeddings

async_mode: threaded # or asyncio
llm:
api_key: ABC
type: openai_embedding # or azure_openai_embedding
model: intfloat/multilingual-e5-large
api_base: http://localhost:8000/v1
api_version: gemma2-9b-it
organization: EraX
deployment_name: gemma2
temperature: 1.0
top_p: 0.95
n: 1
# tokens_per_minute: 150_000 # set a leaky bucket throttle
# requests_per_minute: 10_000 # set a leaky bucket throttle
max_retries: 10
# max_retry_wait: 10.0
# sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
concurrent_requests: 1 # the number of parallel inflight requests that may be made
# batch_size: 16 # the number of documents to send in a single request
# batch_max_tokens: 8191 # the maximum number of tokens to send in a single request
# target: required # or optional

chunks:
size: 512
overlap: 100
group_by_columns: [id] # by default, we don't allow chunks to cross documents

input:
type: file # or blob
file_type: text # or csv
base_dir: "input"
file_encoding: utf-8
file_pattern: ".*\.txt$"

cache:
type: file # or blob
base_dir: "cache"

# connection_string: <azure_blob_storage_connection_string>

# container_name: <azure_blob_storage_container_name>

storage:
type: file # or blob
base_dir: "output/${timestamp}/artifacts"

# connection_string: <azure_blob_storage_connection_string>

# container_name: <azure_blob_storage_container_name>

reporting:
type: file # or console, blob
base_dir: "output/${timestamp}/reports"

# connection_string: <azure_blob_storage_connection_string>

# container_name: <azure_blob_storage_container_name>

entity_extraction:

# llm: override the global llm settings for this task

# parallelization: override the global parallelization settings for this task

# async_mode: override the global async_mode settings for this task

llm:
n: 1
temperature: 1.0
frequency_penalty: 1.0
top_p: 0.95
prompt: "prompts/entity_extraction.txt"
entity_types: [person, partnership, owner, friend, role, technology, equipment, organization, tax code, event, location, date, factory, farm, tower, resort, hotel, real estate, concept, decision, article, creditor, debtor, stock owner, bond owner, fund raiser, issuance, guarantor, investigator, convicted, arrested]
max_gleanings: 0

summarize_descriptions:

# llm: override the global llm settings for this task

# parallelization: override the global parallelization settings for this task

# async_mode: override the global async_mode settings for this task

llm:
n: 1
temperature: 1.0
frequency_penalty: 1.0
top_p: 0.95
prompt: "prompts/summarize_descriptions.txt"
max_length: 500

claim_extraction:

# llm: override the global llm settings for this task

llm:
n: 1
temperature: 1.0
frequency_penalty: 1.0
top_p: 0.95

# parallelization: override the global parallelization settings for this task

# async_mode: override the global async_mode settings for this task

enabled: true

prompt: "prompts/claim_extraction.txt"
description: "các khẳng định có minh chứng liên quan đến các thực thể (entities), tổ chức (orgnizations) hay các quan hệ (relationships)."
max_gleanings: 0

community_report:

# llm: override the global llm settings for this task

# parallelization: override the global parallelization settings for this task

# async_mode: override the global async_mode settings for this task

llm:
n: 1
temperature: 1.0
frequency_penalty: 1.0
top_p: 0.95
prompt: "prompts/community_report.txt"
max_length: 2000
max_input_length: 8000

cluster_graph:
max_cluster_size: 10

embed_graph:
enabled: false # if true, will generate node2vec embeddings for nodes

num_walks: 10

walk_length: 40

window_size: 2

iterations: 3

random_seed: 597832

umap:
enabled: false # if true, will generate UMAP embeddings for nodes

snapshots:
graphml: false
raw_entities: false
top_level_nodes: false

local_search:

text_unit_prop: 0.5

community_prop: 0.1

conversation_history_max_turns: 5

top_k_mapped_entities: 10

top_k_relationships: 10

max_tokens: 12000

global_search:

max_tokens: 12000

data_max_tokens: 12000

map_max_tokens: 1000

reduce_max_tokens: 2000

concurrency: 32

Logs and screenshots

🚀 Reading settings from ragtest/settings.yaml
Using default configuration: {
"llm": {
"api_key": "REDACTED, length 3",
"type": "openai_chat",
"model": "Gemma2_9b_it",
"max_tokens": 1500,
"request_timeout": 180.0,
"api_base": "http://localhost:8900/v1",
"api_version": "gemma2-9b-it",
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": "gemma2",
"model_supports_json": true,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 1,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 1
},
"parallelization": {
"stagger": 0.3,
"num_threads": 1
},
"async_mode": "threaded",
"root_dir": "./ragtest",
"reporting": {
"type": "file",
"base_dir": "output/${timestamp}/reports",
"storage_account_blob_url": null
},
"storage": {
"type": "file",
"base_dir": "output/${timestamp}/artifacts",
"storage_account_blob_url": null
},
"cache": {
"type": "file",
"base_dir": "cache",
"storage_account_blob_url": null
},
"input": {
"type": "file",
"file_type": "text",
"base_dir": "input",
"storage_account_blob_url": null,
"encoding": "utf-8",
"file_pattern": ".*\.txt$",
"file_filter": null,
"source_column": null,
"timestamp_column": null,
"timestamp_format": null,
"text_column": "text",
"title_column": null,
"document_attribute_columns": []
},
"embed_graph": {
"enabled": false,
"num_walks": 10,
"walk_length": 40,
"window_size": 2,
"iterations": 3,
"random_seed": 597832,
"strategy": null
},
"embeddings": {
"llm": {
"api_key": "REDACTED, length 3",
"type": "openai_embedding",
"model": "intfloat/multilingual-e5-large",
"max_tokens": 4000,
"request_timeout": 180.0,
"api_base": "http://localhost:8000/v1",
"api_version": "gemma2-9b-it",
"organization": "REDACTED, length 4",
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": "gemma2",
"model_supports_json": null,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 10,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 1
},
"parallelization": {
"stagger": 0.3,
"num_threads": 1
},
"async_mode": "threaded",
"batch_size": 16,
"batch_max_tokens": 8191,
"target": "required",
"skip": [],
"vector_store": null,
"strategy": null
},
"chunks": {
"size": 512,
"overlap": 100,
"group_by_columns": [
"id"
],
"strategy": null
},
"snapshots": {
"graphml": false,
"raw_entities": false,
"top_level_nodes": false
},
"entity_extraction": {
"llm": {
"api_key": "REDACTED, length 3",
"type": "openai_chat",
"model": "Gemma2_9b_it",
"max_tokens": 1500,
"request_timeout": 180.0,
"api_base": "http://localhost:8900/v1",
"api_version": "gemma2-9b-it",
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": "gemma2",
"model_supports_json": true,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 1,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 1
},
"parallelization": {
"stagger": 0.3,
"num_threads": 1
},
"async_mode": "threaded",
"prompt": "prompts/entity_extraction.txt",
"entity_types": [
"person",
"partnership",
"owner",
"friend",
"role",
"technology",
"equipment",
"organization",
"tax code",
"event",
"location",
"date",
"factory",
"farm",
"tower",
"resort",
"hotel",
"real estate",
"concept",
"decision",
"article",
"creditor",
"debtor",
"stock owner",
"bond owner",
"fund raiser",
"issuance",
"guarantor",
"investigator",
"convicted",
"arrested"
],
"max_gleanings": 0,
"strategy": null
},
"summarize_descriptions": {
"llm": {
"api_key": "REDACTED, length 3",
"type": "openai_chat",
"model": "Gemma2_9b_it",
"max_tokens": 1500,
"request_timeout": 180.0,
"api_base": "http://localhost:8900/v1",
"api_version": "gemma2-9b-it",
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": "gemma2",
"model_supports_json": true,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 1,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 1
},
"parallelization": {
"stagger": 0.3,
"num_threads": 1
},
"async_mode": "threaded",
"prompt": "prompts/summarize_descriptions.txt",
"max_length": 500,
"strategy": null
},
"community_reports": {
"llm": {
"api_key": "REDACTED, length 3",
"type": "openai_chat",
"model": "Gemma2_9b_it",
"max_tokens": 1500,
"request_timeout": 180.0,
"api_base": "http://localhost:8900/v1",
"api_version": "gemma2-9b-it",
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": "gemma2",
"model_supports_json": true,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 1,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 1
},
"parallelization": {
"stagger": 0.3,
"num_threads": 1
},
"async_mode": "threaded",
"prompt": null,
"max_length": 2000,
"max_input_length": 8000,
"strategy": null
},
"claim_extraction": {
"llm": {
"api_key": "REDACTED, length 3",
"type": "openai_chat",
"model": "Gemma2_9b_it",
"max_tokens": 1500,
"request_timeout": 180.0,
"api_base": "http://localhost:8900/v1",
"api_version": "gemma2-9b-it",
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": "gemma2",
"model_supports_json": true,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 1,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 1
},
"parallelization": {
"stagger": 0.3,
"num_threads": 1
},
"async_mode": "threaded",
"enabled": false,
"prompt": "prompts/claim_extraction.txt",
"description": "c\u00e1c kh\u1eb3ng \u0111\u1ecbnh c\u00f3 minh ch\u1ee9ng li\u00ean quan \u0111\u1ebfn c\u00e1c th\u1ef1c th\u1ec3 (entities), t\u1ed5 ch\u1ee9c (orgnizations) hay c\u00e1c quan h\u1ec7 (relationships).",
"max_gleanings": 0,
"strategy": null
},
"cluster_graph": {
"max_cluster_size": 10,
"strategy": null
},
"umap": {
"enabled": false
},
"local_search": {
"text_unit_prop": 0.5,
"community_prop": 0.1,
"conversation_history_max_turns": 5,
"top_k_entities": 10,
"top_k_relationships": 10,
"max_tokens": 12000,
"llm_max_tokens": 2000
},
"global_search": {
"max_tokens": 12000,
"data_max_tokens": 12000,
"map_max_tokens": 1000,
"reduce_max_tokens": 2000,
"concurrency": 32
},
"encoding_model": "cl100k_base",
"skip_workflows": []
}

Additional Information

All latest version installed today 14 July 2024.

Thanks,
Steve

zanderjiang · 2024-07-14T13:42:49Z

These parameters were not included in the configuration initialization. It will be fixed in the next release. In the meantime, try running from source code using poetry if you need to adjust the parameters. https://microsoft.github.io/graphrag/posts/developing

natoverse · 2024-07-22T20:31:00Z

Consolidating alternate model issues here: #657

thusinh1969 added the labels "bug" (Something isn't working) and "triage" (Default label assignment, indicates new issue needs reviewed by a maintainer) on Jul 14, 2024

thusinh1969 changed the title ~~[Bug]: Extra LLM parameters are ignore (temperature, n, top_p) caused vLLM to crash!~~ [Bug]: Extra LLM parameters are ignored (temperature, n, top_p) caused vLLM to crash! Jul 14, 2024

natoverse added the label "community_support" (Issue handled by community members) and removed the labels "bug" (Something isn't working) and "triage" (Default label assignment, indicates new issue needs reviewed by a maintainer) on Jul 22, 2024

natoverse closed this as not planned (won't fix, can't repro, duplicate, stale) on Jul 22, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Bug]: Extra LLM parameters are ignored (temperature, n, top_p) caused vLLM to crash! #550

[Bug]: Extra LLM parameters are ignored (temperature, n, top_p) caused vLLM to crash! #550

thusinh1969 commented Jul 14, 2024

zanderjiang commented Jul 14, 2024

natoverse commented Jul 22, 2024

[Bug]: Extra LLM parameters are ignored (temperature, n, top_p) caused vLLM to crash! #550

[Bug]: Extra LLM parameters are ignored (temperature, n, top_p) caused vLLM to crash! #550

Comments

thusinh1969 commented Jul 14, 2024

Describe the bug

Steps to reproduce

Expected Behavior

GraphRAG Config Used

parallelization: override the global parallelization settings for embeddings

connection_string: <azure_blob_storage_connection_string>

container_name: <azure_blob_storage_container_name>

connection_string: <azure_blob_storage_connection_string>

container_name: <azure_blob_storage_container_name>

connection_string: <azure_blob_storage_connection_string>

container_name: <azure_blob_storage_container_name>

llm: override the global llm settings for this task

parallelization: override the global parallelization settings for this task

async_mode: override the global async_mode settings for this task

llm: override the global llm settings for this task

parallelization: override the global parallelization settings for this task

async_mode: override the global async_mode settings for this task

llm: override the global llm settings for this task

parallelization: override the global parallelization settings for this task

async_mode: override the global async_mode settings for this task

enabled: true

llm: override the global llm settings for this task

parallelization: override the global parallelization settings for this task

async_mode: override the global async_mode settings for this task

num_walks: 10

walk_length: 40

window_size: 2

iterations: 3

random_seed: 597832

text_unit_prop: 0.5

community_prop: 0.1

conversation_history_max_turns: 5

top_k_mapped_entities: 10

top_k_relationships: 10

max_tokens: 12000

max_tokens: 12000

data_max_tokens: 12000

map_max_tokens: 1000

reduce_max_tokens: 2000

concurrency: 32

Logs and screenshots

Additional Information

zanderjiang commented Jul 14, 2024

natoverse commented Jul 22, 2024