add total number of runs to logs & small fixes #72
First changed file — the tutorial notebook:
@@ -129,7 +129,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "**Check if GPU is available**"
+    "Check if **GPU** is available"
    ]
   },
   {
@@ -165,7 +165,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install -q -r ../requirements.txt --disable-pip-version-check && echo \"Success\""
+    "%pip install -q -r ../requirements.txt --disable-pip-version-check && echo \"Success\""
   ]
  },
  {
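Note on the switch: the %pip magic installs into the environment of the running kernel, whereas !pip shells out and can target a different Python installation, which is why %pip is the safer choice inside a notebook.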
@@ -320,11 +320,11 @@
    "outputs": [],
    "source": [
     "searchable_index_params = {\n",
-    "    'chunk_size' : [192, 256, 320],\n",
-    "    'chunk_overlap' : [64, 128, 192],\n",
+    "    'chunk_size' : [128, 192, 256, 320],\n",
+    "    'chunk_overlap' : [32, 64, 96, 128, 160, 192],\n",
     "}\n",
     "searchable_shared_params = {\n",
-    "    'embed_model' : [(\"huggingface\",\"all-MiniLM-L6-v2\")]\n",
+    "    'embed_model' : [(\"huggingface\",\"all-MiniLM-L6-v2\"),(\"huggingface\", \"BAAI/bge-base-en-v1.5\")]\n",
     "}\n",
     "searchable_eval_params = {\n",
     "    'retrieval_top_k' : [2, 4, 8]\n",

Review comment (on the embed_model line): why are we re-introducing the …
Reply: will add it as a comment
@@ -355,7 +355,7 @@
     "shared_args = {\n",
     "    \"base_path\" : \"./data\", # where data goes\n",
     "    \"embed_api_key\" : {},\n",
-    "    \"weaviate_url\" : f\"http://{get_host_ip()}:8081\" # IP address\n",
+    "    \"weaviate_url\" : f\"http://{get_host_ip()}:8081\"\n",
     "}\n",
     "index_args = {\n",
     "    \"n_rows_to_load\" : 1000,\n",
@@ -366,8 +366,8 @@
     "    \"llm_module_name\": \"langchain.chat_models\",\n",
     "    \"llm_class_name\": \"ChatOpenAI\",\n",
     "    \"llm_kwargs\": {\n",
-    "        \"openai_api_key\": \"\" , # TODO: Update with your key or use a different model\n",
-    "        \"model_name\" : \"gpt-3.5-turbo\"\n",
+    "        \"openai_api_key\": \"\" , # TODO: update with your key or use a different model\n",
+    "        \"model_name\" : \"gpt-3.5-turbo\" # choose model, e.g. \"gpt-4\", \"gpt-3.5-turbo\"\n",
     "    },\n",
     "    \"evaluation_metrics\" : [\"context_precision\", \"context_relevancy\"]\n",
     "}\n",
@@ -400,7 +400,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
     "from utils import ParameterSearch\n",
@@ -417,6 +419,7 @@
     "    search_method = search_method,\n",
     "    target_metric = target_metric,\n",
     "    evaluation_set_path=evaluation_set_path,\n",
+    "    debug=True,\n",
     ")\n",
     "\n",
     "results = mysearch.run(weaviate_client)"

Review comment (on debug=True): should be false I suppose, or do we leave it on for the end user?
@@ -465,7 +468,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install -q \"plotly\" --disable-pip-version-check && echo \"Plotly installed successfully\""
+    "%pip install -q \"plotly\" --disable-pip-version-check && echo \"Plotly installed successfully\""
   ]
  },
  {
@@ -483,8 +486,8 @@
    "source": [
     "from utils import add_embed_model_numerical_column, show_legend_embed_models\n",
     "\n",
-    "parameter_search_results = add_embed_model_numerical_column(parameter_search_results)\n",
-    "show_legend_embed_models(parameter_search_results)"
+    "results = add_embed_model_numerical_column(results)\n",
+    "show_legend_embed_models(results)"
   ]
  },
  {
@@ -503,7 +506,7 @@
     "import plotly.express as px\n",
     "\n",
     "dimensions = ['chunk_size', 'chunk_overlap', 'embed_model_numerical', 'retrieval_top_k', 'context_precision']\n",
-    "fig = px.parallel_coordinates(parameter_search_results, color=\"context_precision\",\n",
+    "fig = px.parallel_coordinates(results, color=\"context_precision\",\n",
     "                              dimensions=dimensions,\n",
     "                              color_continuous_scale=px.colors.sequential.Bluered)\n",
     "fig.show()"
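For readers unfamiliar with the plot: px.parallel_coordinates draws one polyline per run across the listed dimensions, colored here by context_precision, so the parameter combinations that yield high scores stand out by color.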
@@ -614,7 +617,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.9.13"
   }
  },
  "nbformat": 4,
Second changed file — the Python module with the ParameterSearch class (imported from utils in the notebook above):
@@ -173,11 +173,12 @@ def run(self, weaviate_client: weaviate.Client):
         while True:
             configs = self.create_configs(run_count)

             # stop running when out of configs
             if configs is None:
                 break

             # create configs
-            indexing_config, evaluation_config = configs
+            indexing_config, evaluation_config, n_runs = configs

             # create pipeline objects
             indexing_pipeline, evaluation_pipeline = self.create_pipelines(
@@ -186,19 +187,23 @@ def run(self, weaviate_client: weaviate.Client):
             )

             # run indexing pipeline
-            self.run_indexing_pipeline(run_count, indexing_config, indexing_pipeline)
+            self.runner.run(indexing_pipeline)
             check_weaviate_class_exists(
                 weaviate_client,
                 indexing_config["weaviate_class"],
             )

+            logger.info(
+                f"Starting indexing pipeline of RUN {run_count}/{n_runs} with {indexing_config}")
+
             # run evaluation pipeline
-            self.run_evaluation_pipeline(
-                run_count,
-                evaluation_config,
-                evaluation_pipeline,
+            self.runner.run(input=evaluation_pipeline,
+                extra_volumes=self.extra_volumes,
             )

+            logger.info(
+                f"Starting evaluation pipeline of run #{run_count} / {n_runs} with {evaluation_config}")
+
             # read metrics from pipeline output
             metrics = {}

Review comment (on the new logger.info for the indexing pipeline): This should probably be above line 190, before we start the run.
Reply: makes sense
Review comment (on the new logger.info for the evaluation pipeline): Same comment as above.
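In other words, the reviewer suggests moving each logger.info call above the corresponding self.runner.run invocation, so the "Starting … pipeline" message is emitted before the pipeline runs rather than after it has already finished.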
@@ -222,6 +227,7 @@ def create_configs(self, run_count: int):
         if self.search_method == "grid_search":
             # all possible combinations of parameters
             all_combinations = list(cartesian_product(self.searchable_params))
+            n_runs = len(all_combinations)

             # when all combinations have been tried, stop searching
             if run_count > len(all_combinations) - 1:
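For a sense of scale, here is a minimal, hypothetical sketch of how the grid-search run count falls out of the notebook's searchable parameters, assuming the cartesian_product helper in utils behaves like itertools.product over the dict values (the helper itself is not shown in this diff):

from itertools import product

# Searchable parameters as configured in the notebook above.
searchable_params = {
    "chunk_size": [128, 192, 256, 320],
    "chunk_overlap": [32, 64, 96, 128, 160, 192],
    "embed_model": [
        ("huggingface", "all-MiniLM-L6-v2"),
        ("huggingface", "BAAI/bge-base-en-v1.5"),
    ],
    "retrieval_top_k": [2, 4, 8],
}

# Assumed stand-in for utils.cartesian_product: one dict per combination.
keys = list(searchable_params)
all_combinations = [
    dict(zip(keys, values)) for values in product(*searchable_params.values())
]

n_runs = len(all_combinations)
print(n_runs)  # 4 * 6 * 2 * 3 = 144 grid-search runs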
@@ -252,9 +258,8 @@ def create_configs(self, run_count: int):
                     keys_to_try.append(key)
                     values_to_try.append(option)
                     step += 1
-            variations_to_try = [
-                {keys_to_try[i]: values_to_try[i]} for i in range(len(keys_to_try))
-            ]
+            variations_to_try = [{keys_to_try[i]: values_to_try[i]} for i in range(len(keys_to_try))]
+            n_runs = len(variations_to_try) + 1

             # if there are no variations to try, just schedule one run
             if len(variations_to_try) == 0:
@@ -315,7 +320,7 @@ def create_configs(self, run_count: int):
                 "embed_model"
             ] = indexing_config["embed_model"][1]

-        return indexing_config, evaluation_config
+        return indexing_config, evaluation_config, n_runs

     def create_pipelines(self, indexing_config, evaluation_config):
         # create indexing pipeline
@@ -351,21 +356,4 @@ def create_pipelines(self, indexing_config, evaluation_config):
         logger.info("\nEvaluation pipeline parameters:")
         logger.info({**self.shared_args, **self.eval_args, **evaluation_config})

         return indexing_pipeline, evaluation_pipeline
-
-    def run_indexing_pipeline(self, run_count, indexing_config, indexing_pipeline):
-        logger.info(
-            f"Starting indexing pipeline of run #{run_count} with {indexing_config}",
-        )
-        self.runner.run(indexing_pipeline)
-
-    def run_evaluation_pipeline(
-        self,
-        run_count,
-        evaluation_config,
-        evaluation_pipeline,
-    ):
-        logger.info(
-            f"Starting evaluation pipeline of run #{run_count} with {evaluation_config}",
-        )
-        self.runner.run(input=evaluation_pipeline, extra_volumes=self.extra_volumes)
Review comment (on the expanded parameter lists): is there a risk that this will run for a very long time? I think the best balance might be in providing few values with large intervals between them, e.g. [32, 128, 192], in order to show the contrast in the final score with a relatively short time to complete. We want to minimize the wait time + API costs for the end user.
Reply: agree
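For reference on the scale involved: with the expanded lists, an exhaustive grid search covers 4 chunk sizes × 6 overlaps × 2 embedding models × 3 top-k values = 144 indexing-plus-evaluation runs, versus 3 × 3 × 1 × 3 = 27 with the previous values — the runtime and API-cost blow-up the reviewer is pointing at.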