diff --git a/versioned_docs/version-2.0/how_to_guides/datasets/manage_datasets_in_application.mdx b/versioned_docs/version-2.0/how_to_guides/datasets/manage_datasets_in_application.mdx index cf8ac144..41fb9e3c 100644 --- a/versioned_docs/version-2.0/how_to_guides/datasets/manage_datasets_in_application.mdx +++ b/versioned_docs/version-2.0/how_to_guides/datasets/manage_datasets_in_application.mdx @@ -35,7 +35,7 @@ You can do this from any 'run' details page by clicking the 'Add to Dataset' but :::tip An extremely powerful technique to build datasets is to drill-down into the most interesting traces, such as traces that were tagged with poor user feedback, and add them to a dataset. -For tips on how to filter traces, see the [filtering traces] guide. +For tips on how to filter traces, see the [filtering traces](../monitoring/filter_traces_in_application) guide. ::: :::tip automations diff --git a/versioned_docs/version-2.0/how_to_guides/evaluation/fetch_perf_metrics_experiment.mdx b/versioned_docs/version-2.0/how_to_guides/evaluation/fetch_perf_metrics_experiment.mdx new file mode 100644 index 00000000..2f9ec936 --- /dev/null +++ b/versioned_docs/version-2.0/how_to_guides/evaluation/fetch_perf_metrics_experiment.mdx @@ -0,0 +1,162 @@ +--- +sidebar_position: 10 +--- + +import { + CodeTabs, + PythonBlock, + TypeScriptBlock, +} from "@site/src/components/InstructionsWithCode"; + +# Fetch performance metrics for an experiment + +:::tip Experiments, Projects, and Sessions + +Tracing projects and experiments use the same underlying data structure in our backend, which is called a "session." + +You might see these terms interchangeably in our documentation, but they all refer to the same underlying data structure. + +We are working on unifying the terminology across our documentation and APIs. +::: + +When you run an experiment using `evaluate` with the Python or TypeScript SDK, you can fetch the performance metrics for the experiment using the `read_project`/`readProject` methods. + +The payload for experiment details includes the following values: + +```json +{ + "start_time": "2024-06-06T01:02:51.299960", + "end_time": "2024-06-06T01:03:04.557530+00:00", + "extra": { + "metadata": { + "git": { + "tags": null, + "dirty": true, + "branch": "ankush/agent-eval", + "commit": "...", + "repo_name": "...", + "remote_url": "...", + "author_name": "Ankush Gola", + "commit_time": "...", + "author_email": "..." + }, + "revision_id": null, + "dataset_splits": ["base"], + "dataset_version": "2024-06-05T04:57:01.535578+00:00", + "num_repetitions": 3 + } + }, + "name": "SQL Database Agent-ae9ad229", + "description": null, + "default_dataset_id": null, + "reference_dataset_id": "...", + "id": "...", + "run_count": 9, + "latency_p50": 7.896, + "latency_p99": 13.09332, + "first_token_p50": null, + "first_token_p99": null, + "total_tokens": 35573, + "prompt_tokens": 32711, + "completion_tokens": 2862, + "total_cost": 0.206485, + "prompt_cost": 0.163555, + "completion_cost": 0.04293, + "tenant_id": "...", + "last_run_start_time": "2024-06-06T01:02:51.366397", + "last_run_start_time_live": null, + "feedback_stats": { + "cot contextual accuracy": { + "n": 9, + "avg": 0.6666666666666666, + "values": { + "CORRECT": 6, + "INCORRECT": 3 + } + } + }, + "session_feedback_stats": {}, + "run_facets": [], + "error_rate": 0, + "streaming_rate": 0, + "test_run_number": 11 +} +``` + +From here, you can extract performance metrics such as: + +- `latency_p50`: The 50th percentile latency in seconds. 
+- `latency_p99`: The 99th percentile latency in seconds.
+- `total_tokens`: The total number of tokens used.
+- `prompt_tokens`: The number of prompt tokens used.
+- `completion_tokens`: The number of completion tokens used.
+- `total_cost`: The total cost of the experiment.
+- `prompt_cost`: The cost of the prompt tokens.
+- `completion_cost`: The cost of the completion tokens.
+- `feedback_stats`: The feedback statistics for the experiment.
+- `error_rate`: The error rate for the experiment.
+- `first_token_p50`: The 50th percentile latency for the time to generate the first token (if using streaming).
+- `first_token_p99`: The 99th percentile latency for the time to generate the first token (if using streaming).
+
+Here is an example of how you can fetch the performance metrics for an experiment using the Python and TypeScript SDKs.
+
+First, as a prerequisite, we will create a trivial dataset. Here, we only demonstrate this in Python, but you can do the same in TypeScript.
+Please view the [how-to guide](./evaluate_llm_application) on evaluation for more details.
+
+```python
+from langsmith import Client
+
+client = Client()
+
+# Create a dataset
+examples = [
+    ("Harrison", "Hello Harrison"),
+    ("Ankush", "Hello Ankush"),
+]
+
+dataset_name = "HelloDataset"
+dataset = client.create_dataset(dataset_name=dataset_name)
+inputs, outputs = zip(
+    *[({"input": text}, {"expected": result}) for text, result in examples]
+)
+client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)
+```
+
+Next, we will create an experiment, retrieve the experiment name from the result of `evaluate`, then fetch the performance metrics for the experiment.
+
+<CodeTabs
+  tabs={[
+    PythonBlock(`# Row-level evaluator\ndef foo_label(run, example) -> dict:
+    return {"score": 1, "key": "foo"}\n
+from langsmith.evaluation import evaluate\n
+results = evaluate(
+    lambda inputs: "Hello " + inputs["input"],
+    data=dataset_name,
+    evaluators=[foo_label],
+    experiment_prefix="Hello",
+)\n
+resp = client.read_project(project_name=results.experiment_name, include_stats=True)\n
+print(resp.json(indent=2))`),
+    TypeScriptBlock(`import { Client } from "langsmith";
+import { evaluate } from "langsmith/evaluation";
+import type { EvaluationResult } from "langsmith/evaluation";
+import type { Run, Example } from "langsmith/schemas";\n
+// Row-level evaluator
+function fooLabel(rootRun: Run, example: Example): EvaluationResult {
+  return {score: 1, key: "foo"};
+}\n
+const client = new Client();\n
+const results = await evaluate((inputs) => {
+  return { output: "Hello " + inputs.input };
+}, {
+  data: "HelloDataset",
+  experimentPrefix: "Hello",
+  evaluators: [fooLabel],
+});\n
+const resp = await client.readProject({ projectName: results.experimentName, includeStats: true })
+console.log(JSON.stringify(resp, null, 2))`),
+  ]}
+/>
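+
+As a quick illustration, here is one way you might pull a few of these metrics out of the Python response above. This is only a sketch: it reuses the `resp` object returned by `client.read_project(..., include_stats=True)` and the field names from the example payload shown earlier.
+
+```python
+import json
+
+# `resp` comes from the Python example above; `resp.json()` serializes the
+# experiment payload shown earlier in this guide.
+stats = json.loads(resp.json())
+
+print(f"p50 latency (s): {stats['latency_p50']}")
+print(f"p99 latency (s): {stats['latency_p99']}")
+print(f"total tokens:    {stats['total_tokens']}")
+print(f"total cost:      {stats['total_cost']}")
+print(f"error rate:      {stats['error_rate']}")
+
+# Feedback stats are keyed by feedback key, e.g. "cot contextual accuracy".
+for key, value in (stats.get("feedback_stats") or {}).items():
+    print(f"{key}: n={value['n']}, avg={value['avg']}")
+```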
diff --git a/versioned_docs/version-2.0/how_to_guides/index.md b/versioned_docs/version-2.0/how_to_guides/index.md
index 94c68d09..52129328 100644
--- a/versioned_docs/version-2.0/how_to_guides/index.md
+++ b/versioned_docs/version-2.0/how_to_guides/index.md
@@ -71,6 +71,7 @@ Get started with LangSmith's tracing features to start adding observability to y
 - [Trace without setting environment variables](./how_to_guides/tracing/trace_with_langchain#trace-without-setting-environment-variables)
 - [Trace with `Instructor` (Python only)](./how_to_guides/tracing/trace_with_instructor)
 - [Trace without setting environment variables](./how_to_guides/tracing/trace_without_env_vars)
+- [Trace using the LangSmith REST API](./how_to_guides/tracing/trace_with_api)

 ## Datasets

@@ -138,6 +139,7 @@ Evaluate your LLM applications to measure their performance over time.
 - [In the comparison view](./how_to_guides/evaluation/audit_evaluator_scores#in-the-comparison-view)
 - [In the runs table](./how_to_guides/evaluation/audit_evaluator_scores#in-the-runs-table)
 - [In the SDK](./how_to_guides/evaluation/audit_evaluator_scores#in-the-sdk)
+- [Fetch performance metrics for an experiment](./how_to_guides/evaluation/fetch_perf_metrics_experiment)

 ## Human feedback

diff --git a/versioned_docs/version-2.0/how_to_guides/tracing/trace_with_api.mdx b/versioned_docs/version-2.0/how_to_guides/tracing/trace_with_api.mdx
new file mode 100644
index 00000000..8ca3d946
--- /dev/null
+++ b/versioned_docs/version-2.0/how_to_guides/tracing/trace_with_api.mdx
@@ -0,0 +1,83 @@
+---
+sidebar_position: 17
+---
+
+# Trace using the LangSmith REST API
+
+It is HIGHLY recommended to use our Python or TypeScript SDKs to send traces to LangSmith.
+We have designed these SDKs with several optimizations, including batching and backgrounding, to ensure that your application's performance is not impacted by sending traces to LangSmith.
+However, if you are unable to use our SDKs, you can use the LangSmith REST API to send traces. Performance may be impacted if you send traces synchronously in your application.
+This guide will show you how to trace a request using the LangSmith REST API. Please view our API documentation [here](https://api.smith.langchain.com/redoc) for a full list of endpoints and request/response schemas.
+
+:::note
+When using the LangSmith REST API, you will need to provide your API key in the request headers as `"x-api-key"`.
+
+Additionally, you should NOT set the `dotted_order` and `trace_id` fields in the request body. These fields will be automatically generated by the system.
+:::
+
+The following example shows how you might leverage our API directly in Python. The same principles apply to other languages.
+
+```python
+import openai
+import os
+import requests
+from datetime import datetime
+from uuid import uuid4
+
+def post_run(run_id, name, run_type, inputs, parent_id=None):
+    """Function to post a new run to the API."""
+    data = {
+        "id": run_id.hex,
+        "name": name,
+        "run_type": run_type,
+        "inputs": inputs,
+        "start_time": datetime.utcnow().isoformat(),
+    }
+    if parent_id:
+        data["parent_run_id"] = parent_id.hex
+    requests.post(
+        "https://api.smith.langchain.com/runs",
+        json=data,
+        headers=headers
+    )
+
+def patch_run(run_id, outputs):
+    """Function to patch a run with outputs."""
+    requests.patch(
+        f"https://api.smith.langchain.com/runs/{run_id}",
+        json={
+            "outputs": outputs,
+            "end_time": datetime.utcnow().isoformat(),
+        },
+        headers=headers,
+    )
+
+# Send your API Key in the request headers
+headers = {"x-api-key": os.environ["LANGCHAIN_API_KEY"]}
+
+# This can be a user input to your app
+question = "Can you summarize this morning's meetings?"
+
+# This can be retrieved in a retrieval step
+context = "During this morning's meeting, we solved all world conflict."
+messages = [
+    {"role": "system", "content": "You are a helpful assistant. Please respond to the user's request only based on the given context."},
+    {"role": "user", "content": f"Question: {question}\nContext: {context}"}
+]
+
+# Create parent run
+parent_run_id = uuid4()
+post_run(parent_run_id, "Chat Pipeline", "chain", {"question": question})
+
+# Create child run
+child_run_id = uuid4()
+post_run(child_run_id, "OpenAI Call", "llm", {"messages": messages}, parent_run_id)
+
+# Generate a completion
+client = openai.Client()
+chat_completion = client.chat.completions.create(model="gpt-3.5-turbo", messages=messages)
+
+# End runs
+patch_run(child_run_id, chat_completion.dict())
+patch_run(parent_run_id, {"answer": chat_completion.choices[0].message.content})
+```
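+
+Because the calls above are made synchronously, you may also want to confirm that each request succeeded. The snippet below is an optional sketch, not part of the example above: it wraps the same `POST /runs` call with basic status checking via `requests.Response.raise_for_status()`, which raises an exception on any 4xx/5xx response (for example, a missing or invalid API key).
+
+```python
+import os
+import requests
+from datetime import datetime
+from uuid import uuid4
+
+headers = {"x-api-key": os.environ["LANGCHAIN_API_KEY"]}
+
+def post_run_checked(run_id, name, run_type, inputs, parent_id=None):
+    """Post a new run and raise if the API rejects the request."""
+    data = {
+        "id": run_id.hex,
+        "name": name,
+        "run_type": run_type,
+        "inputs": inputs,
+        "start_time": datetime.utcnow().isoformat(),
+    }
+    if parent_id:
+        data["parent_run_id"] = parent_id.hex
+    response = requests.post(
+        "https://api.smith.langchain.com/runs",
+        json=data,
+        headers=headers,
+    )
+    # Surface authentication or validation errors immediately
+    response.raise_for_status()
+    return response
+
+# Example usage: create a (hypothetical) top-level run
+run_id = uuid4()
+post_run_checked(run_id, "Chat Pipeline", "chain", {"question": "Can you summarize this morning's meetings?"})
+```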