docs: moved from ipnb files to md files #1482

Merged: 15 commits, Oct 14, 2024
Changes from 13 commits
4 changes: 2 additions & 2 deletions .readthedocs.yml

    @@ -7,5 +7,5 @@ build:
       commands:
         - pip install -e .[docs]
         - if [ -n "$GH_TOKEN" ]; then pip install git+https://${GH_TOKEN}@github.com/squidfunk/mkdocs-material-insiders.git; fi
    -    - python scripts/gen_ref_pages.py
    -    - mkdocs build --site-dir $READTHEDOCS_OUTPUT/html
    +    - make ipynb-to-md
    +    - mkdocs build --site-dir $READTHEDOCS_OUTPUT/html
9 changes: 5 additions & 4 deletions Makefile

    @@ -33,13 +33,14 @@ test-e2e: ## Run end2end tests
     run-ci: format lint type test ## Running all CI checks
     
     # Docs
    -docsite: ## Build and serve documentation
    -	@echo "Generating reference pages..."
    -	@python scripts/gen_ref_pages.py
    -	@mkdocs serve --dirty
     rewrite-docs: ## Use GPT4 to rewrite the documentation
     	@echo "Rewriting the documentation in directory $(DIR)..."
     	@python $(GIT_ROOT)/docs/python alphred.py --directory $(DIR)
    +ipynb-to-md: ## Convert ipynb files to md files
    +	@python $(GIT_ROOT)/scripts/ipynb_to_md.py
    +docsite: ## Build and serve documentation
    +	@$(MAKE) ipynb-to-md
    +	@mkdocs serve
     
     # Benchmarks
     run-benchmarks-eval: ## Run benchmarks for Evaluation
116 changes: 116 additions & 0 deletions docs/howtos/applications/_cost.md
# Understand Cost and Usage of Operations

When using LLMs for evaluation and test set generation, cost is an important factor. Ragas provides some tools to help you track it.

## Understanding `TokenUsageParser`

By default, Ragas does not calculate token usage for `evaluate()`. This is because LangChain's LLMs do not always return information about token usage in a uniform way. So in order to get the usage data, we have to implement a `TokenUsageParser`.

A `TokenUsageParser` is a function that parses the `LLMResult` or `ChatResult` returned by a LangChain model's `generate_prompt()` call and outputs the `TokenUsage` object that Ragas expects.

As an example, here is one that parses OpenAI results, using a parser Ragas already provides.


```python
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.prompt_values import StringPromptValue

gpt4o = ChatOpenAI(model="gpt-4o")
p = StringPromptValue(text="hai there")
llm_result = gpt4o.generate_prompt([p])

# lets import a parser for OpenAI
from ragas.cost import get_token_usage_for_openai

get_token_usage_for_openai(llm_result)
```




    TokenUsage(input_tokens=9, output_tokens=9, model='')



You can define your own parser or import one if it is already defined. If you would like to suggest a parser for other LLM providers or contribute your own, please check out this [issue](https://github.com/explodinggradients/ragas/issues/1151) 🙂.
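
If there is no built-in parser for your provider, you can write one yourself. Below is a minimal sketch of a custom parser; the `llm_output["token_usage"]` key layout and the `prompt_tokens`/`completion_tokens` field names are assumptions, so check what your model actually returns.

```python
from langchain_core.outputs import LLMResult

from ragas.cost import TokenUsage


def my_token_usage_parser(result: LLMResult) -> TokenUsage:
    # Assumption: the provider reports usage under `llm_output["token_usage"]`.
    usage = (result.llm_output or {}).get("token_usage", {})
    return TokenUsage(
        input_tokens=usage.get("prompt_tokens", 0),
        output_tokens=usage.get("completion_tokens", 0),
    )
```

You would then pass `token_usage_parser=my_token_usage_parser` to `evaluate()`.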

You can use it in evaluations as shown below, reusing the example from [get started](get-started-evaluation).


```python
from datasets import load_dataset
from ragas.metrics import (
answer_relevancy,
faithfulness,
context_recall,
context_precision,
)

amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v2")
amnesty_qa
```

    Repo card metadata block was not found. Setting CardData to empty.

    DatasetDict({
        eval: Dataset({
            features: ['question', 'ground_truth', 'answer', 'contexts'],
            num_rows: 20
        })
    })




```python
from ragas import evaluate
from ragas.cost import get_token_usage_for_openai

result = evaluate(
amnesty_qa["eval"],
metrics=[
context_precision,
faithfulness,
answer_relevancy,
context_recall,
],
llm=gpt4o,
token_usage_parser=get_token_usage_for_openai,
)
```


    Evaluating: 0%| | 0/80 [00:00<?, ?it/s]



```python
result.total_tokens()
```




    TokenUsage(input_tokens=116765, output_tokens=39031, model='')



You can compute the cost for each run by passing the cost per token to the `Result.total_cost()` function.

In this case, GPT-4o costs $5 per 1M input tokens and $15 per 1M output tokens.


```python
result.total_cost(cost_per_input_token=5 / 1e6, cost_per_output_token=15 / 1e6)
```




    1.1692900000000002
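
This matches the token counts above: 116,765 input tokens at $5 per 1M is about $0.58, and 39,031 output tokens at $15 per 1M is about $0.59, for a total of roughly $1.17.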


52 changes: 52 additions & 0 deletions docs/howtos/customizations/_run_config.md
# RunConfig

The `RunConfig` allows you to pass run parameters to functions like `evaluate()` and `TestsetGenerator.generate()`. Depending on your LLM provider's rate limits, SLAs, and traffic, controlling these parameters can improve the speed and reliability of Ragas runs.

How to configure the `RunConfig` in:

- [Evaluate](#evaluate)
- [TestsetGenerator](#testsetgenerator)

## Rate Limits

Ragas leverages parallelism with async in Python, and the `RunConfig` has a field called `max_workers` that controls the number of concurrent requests allowed at once. You can adjust it to get the maximum concurrency your provider allows.


```python
from ragas.run_config import RunConfig

# increasing max_workers to 64 and timeout to 60 seconds

my_run_config = RunConfig(max_workers=64, timeout=60)
```

### Evaluate


```python
from ragas import EvaluationDataset, SingleTurnSample
from ragas.metrics import Faithfulness
from datasets import load_dataset
from ragas import evaluate

dataset = load_dataset("explodinggradients/amnesty_qa", "english_v3")

samples = []
for row in dataset["eval"]:
sample = SingleTurnSample(
user_input=row["user_input"],
reference=row["reference"],
response=row["response"],
retrieved_contexts=row["retrieved_contexts"],
)
samples.append(sample)

eval_dataset = EvaluationDataset(samples=samples)
metric = Faithfulness()

_ = evaluate(
dataset=eval_dataset,
metrics=[metric],
run_config=my_run_config,
)
```
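
### TestsetGenerator

The same `RunConfig` can be passed to test set generation. The sketch below is only an assumption-laden example: it assumes the `TestsetGenerator(llm=..., embedding_model=...)` constructor and a `run_config` argument on `generate_with_langchain_docs()`, and it uses a hypothetical `data/` directory for the source documents. Adjust it to the API of the Ragas version you are using.

```python
from langchain_community.document_loaders import DirectoryLoader

from ragas.embeddings import embedding_factory
from ragas.llms import llm_factory
from ragas.testset import TestsetGenerator

# "data/" is a placeholder; point the loader at your own documents.
docs = DirectoryLoader("data/").load()

generator = TestsetGenerator(llm=llm_factory(), embedding_model=embedding_factory())
testset = generator.generate_with_langchain_docs(
    docs,
    testset_size=10,
    run_config=my_run_config,  # reuse the RunConfig defined above
)
```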
110 changes: 110 additions & 0 deletions docs/howtos/customizations/metrics/_metrics_language_adaptation.md
# Adapting metrics to target language

When using Ragas to evaluate LLM application workflows, you may have applications in languages other than English. In this case, it is best to adapt your LLM-powered evaluation metrics to the target language. One obvious way to do this is to manually change the instruction and demonstrations, but this can be time consuming. Ragas offers automatic language adaptation, with which you can adapt any metric to the target language by using the LLM itself. This notebook demonstrates this with a simple example.

For the sake of this example, let's choose a metric and inspect its default prompts.


```python
from ragas.metrics import SimpleCriteriaScoreWithReference

scorer = SimpleCriteriaScoreWithReference(
name="course_grained_score", definition="Score 0 to 5 by similarity"
)
```


```python
scorer.get_prompts()
```




    {'multi_turn_prompt': <ragas.metrics._simple_criteria.MultiTurnSimpleCriteriaWithReferencePrompt at 0x7fcf409c3880>,
     'single_turn_prompt': <ragas.metrics._simple_criteria.SingleTurnSimpleCriteriaWithReferencePrompt at 0x7fcf409c3a00>}



As you can see, the instruction and demonstrations are both in English. Next, set up the LLM to be used for this conversion.


```python
from ragas.llms import llm_factory

llm = llm_factory()
```

To view the supported language codes:


```python
from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES

print(list(RAGAS_SUPPORTED_LANGUAGE_CODES.keys()))
```

    ['english', 'hindi', 'marathi', 'chinese', 'spanish', 'amharic', 'arabic', 'armenian', 'bulgarian', 'urdu', 'russian', 'polish', 'persian', 'dutch', 'danish', 'french', 'burmese', 'greek', 'italian', 'japanese', 'deutsch', 'kazakh', 'slovak']


Now let's adapt the metric to Hindi as the target language using the `adapt_prompts` method.
Language adaptation in Ragas works by translating the few-shot examples provided along with the prompts into the target language; the instructions remain in English.


```python
adapted_prompts = await scorer.adapt_prompts(language="hindi", llm=llm)
```

Inspect the adapted prompts and make corrections if needed.


```python
adapted_prompts
```




    {'multi_turn_prompt': <ragas.metrics._simple_criteria.MultiTurnSimpleCriteriaWithReferencePrompt at 0x7fcf42bc40a0>,
     'single_turn_prompt': <ragas.metrics._simple_criteria.SingleTurnSimpleCriteriaWithReferencePrompt at 0x7fcf722de890>}
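
The repr above only shows object references. A short sketch like the one below can help inspect the translated content; the `instruction` and `examples` attribute names are assumed from the `PydanticPrompt` interface.

```python
for name, prompt in adapted_prompts.items():
    print(name)
    print(prompt.instruction)  # the instruction stays in English
    print(prompt.examples[0])  # the few-shot examples are translated to Hindi
```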



Set the adapted prompts on the metric using the `set_prompts` method:


```python
scorer.set_prompts(**adapted_prompts)
```

Evaluate using the adapted metric:


```python
from ragas.dataset_schema import SingleTurnSample

sample = SingleTurnSample(
user_input="एफिल टॉवर कहाँ स्थित है?",
response="एफिल टॉवर पेरिस में स्थित है।",
reference="एफिल टॉवर मिस्र में स्थित है",
)

scorer.llm = llm
await scorer.single_turn_ascore(sample)
```




    0



Trace of the reasoning and score (the reason roughly translates to "There is a significant difference between the response and the reference answer with regard to the location."):

    {
        "reason": "प्रतिक्रिया और संदर्भ के उत्तर में स्थान के संदर्भ में महत्वपूर्ण भिन्नता है।",
        "score": 0
    }

