diff --git a/02_activities/assignment_1.ipynb b/02_activities/assignment_1.ipynb index a6487109..4611e1f1 100644 --- a/02_activities/assignment_1.ipynb +++ b/02_activities/assignment_1.ipynb @@ -49,10 +49,19 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 25, "id": "b8dbcc48", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dotenv extension is already loaded. To reload it, use:\n", + " %reload_ext dotenv\n" + ] + } + ], "source": [ "%load_ext dotenv\n", "%dotenv ../05_src/.secrets" @@ -84,11 +93,116 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "id": "256159db", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pages: 13\n" + ] + } + ], + "source": [ + "# load pdf uisng long chain and join pages -------------------------\n", + "from langchain_community.document_loaders import PyPDFLoader\n", + "\n", + "pdf_path = \"documents/managing_oneself.pdf\"\n", + "\n", + "loader = PyPDFLoader(pdf_path)\n", + "docs = loader.load()\n", + "\n", + "print(\"Pages:\", len(docs))\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "d41394e6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total characters: 51452\n", + "www.hbr.org\n", + "B\n", + " \n", + "EST \n", + " \n", + "OF HBR 1999\n", + " \n", + "Managing Oneself\n", + " \n", + "by Peter F . 
Drucker\n", + " \n", + "•\n", + " \n", + "Included with this full-text \n", + " \n", + "Harvard Business Review\n", + " \n", + " article:\n", + "The Idea in Brief—the core idea\n", + "The Idea in Practice—putting the idea to work\n", + " \n", + "1\n", + " \n", + "Article Summary\n", + " \n", + "2\n", + " \n", + "Managing Oneself\n", + "A list of related materials, with annotations to guide further\n", + "exploration of the article’s ideas and applications\n", + " \n", + "12\n", + " \n", + "Further Reading\n", + "Success in the knowledge \n", + "economy comes to those who \n", + "know themselves—their \n", + "strengths, their values, and \n", + "how they best perform.\n", + " \n", + "Reprint R0501KThis document is authorized for use only by Sharon Brooks (SHARON@PRICE-ASSOCIATES.COM). Copying or posting is an infringement of copyright. Please contact \n", + "customerservice@harvardbusiness.org or 800-988-0886 for additional copies.\n", + "B\n", + " \n", + "EST\n", + " \n", + " \n", + " \n", + "OF\n", + " \n", + " HBR 1999\n", + " \n", + "Managing Oneself\n", + " \n", + "page 1\n", + " \n", + "The Idea in Brief The Idea in Practice\n", + " \n", + "COPYRIGHT © 2004 HARVARD BUSINESS SCHOOL PUBLISHING CORPORATION. ALL RIGHTS RESERVED.\n", + " \n", + "We live in an age of\n" + ] + } + ], + "source": [ + "#- combine all pages into one -------------\n", + "document_text = \"\"\n", + "\n", + "for page in docs:\n", + " document_text += page.page_content + \"\\n\"\n", + "\n", + "print(\"Total characters:\", len(document_text))\n", + "print(document_text[:1000]) # Preview \n" + ] }, { "cell_type": "markdown", @@ -119,11 +233,98 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "id": "87372dc1", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "ArticleStructuredOutput(Author='Peter F. 
Drucker', Title='Managing Oneself', Relevance=\"This article is critically relevant for AI professionals, as it underscores the importance of self-awareness and management in shaping successful careers in a rapidly changing knowledge economy, where understanding one's strengths and learning styles can inform effective collaboration and innovation in AI projects.\", Summary='In the contemporary workplace, wherein knowledge workers are urged to take charge of their careers, self-management becomes paramount. Individuals must assume the role of Chief Executive Officer (CEO) of their personal development, necessitating an acute awareness of their strengths, weaknesses, values, and optimal working conditions. Successful performance hinges on three core questions: What are my strengths? How do I perform best? What values guide my actions? Employing feedback analysis allows individuals to discern their actual capabilities versus perceptions, fostering an environment where strengths are leveraged, and weaknesses addressed appropriately. Additionally, understanding work preferences—whether collaborative or independent—and aligning with compatible organizational values is essential to maintain engagement and productivity. As work life extends potentially over five decades, individuals are encouraged to envision their professional journeys not just linearly but as a series of contributions aligned with personal growth and fulfillment. Hence, identifying where one truly belongs and contributing meaningfully to an organization are key elements of a sustainable career paradigm. 
The overarching message underscores that rather than waiting for organizational dictates, it is the responsibility of each individual to define and navigate their career trajectory strategically.', Tone='Bureaucratese', InputTokens=12443, OutputTokens=326)" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "from pydantic import BaseModel, Field\n", + "from openai import OpenAI\n", + "\n", + "# ---------- 0) Load gateway key from .secrets ----------\n", + "load_dotenv(\"../05_src/.secrets\", override=True)\n", + "\n", + "gateway_key = os.getenv(\"API_GATEWAY_KEY\")\n", + "if not gateway_key:\n", + " raise ValueError(\"API_GATEWAY_KEY not found in ../05_src/.secrets\")\n", + "\n", + "# OpenAI SDK requires an api_key value; gateway auth happens via x-api-key header\n", + "os.environ[\"OPENAI_API_KEY\"] = \"any_value\"\n", + "\n", + "# ---------- 1) model NOT in GPT-5 family ----------\n", + "MODEL_NAME = \"gpt-4o-mini\"\n", + "\n", + "# ---------- 2) tone ----------\n", + "TONE = \"Bureaucratese\"\n", + "\n", + "# ---------- 3) Pydantic schema for structured output ----------\n", + "class ArticleStructuredOutput(BaseModel):\n", + " Author: str = Field(..., description=\"Author of the article\")\n", + " Title: str = Field(..., description=\"Title of the article\")\n", + " Relevance: str = Field(..., description=\"<= one paragraph: why this article is relevant for an AI professional\")\n", + " Summary: str = Field(..., description=\"Concise summary <= 1000 tokens\")\n", + " Tone: str = Field(..., description=\"Tone used to produce the summary\")\n", + " InputTokens: int = Field(..., description=\"Number of input tokens\")\n", + " OutputTokens: int = Field(..., description=\"Number of output tokens\")\n", + "\n", + "# ---------- 4) Create client using gateway base_url + x-api-key header ----------\n", + "client = OpenAI(\n", + " 
base_url=\"https://k7uffyg03f.execute-api.us-east-1.amazonaws.com/prod/openai/v1\",\n", + " default_headers={\"x-api-key\": gateway_key},\n", + ")\n", + "\n", + "# ---------- 5) Instructions separate from context; context added dynamically ----------\n", + "developer_instructions = (\n", + " \"You are a careful analyst. Produce ONLY valid JSON that matches the provided schema exactly. \"\n", + " \"Do not add extra keys. Keep 'Relevance' to one paragraph max. \"\n", + " \"Keep 'Summary' concise and under 1000 tokens. \"\n", + " f\"Write the Summary in a clearly distinguishable tone: {TONE}. \"\n", + " \"Set the 'Tone' field to exactly that tone string.\"\n", + ")\n", + "\n", + "user_prompt = f\"\"\"\n", + "Analyze the following document text and extract:\n", + "- Author\n", + "- Title\n", + "- Relevance (<= one paragraph)\n", + "- Summary (<= 1000 tokens) in tone: {TONE}\n", + "Return the result as JSON matching the schema.\n", + "\n", + "DOCUMENT TEXT:\n", + "{document_text}\n", + "\"\"\"\n", + "\n", + "# ---------- 6) Parse into Pydantic model ----------\n", + "response = client.responses.parse(\n", + " model=MODEL_NAME,\n", + " input=[\n", + " {\"role\": \"developer\", \"content\": developer_instructions},\n", + " {\"role\": \"user\", \"content\": user_prompt},\n", + " ],\n", + " text_format=ArticleStructuredOutput,\n", + ")\n", + "\n", + "result: ArticleStructuredOutput = response.output_parsed\n", + "\n", + "# Fill token usage from response object\n", + "result.InputTokens = response.usage.input_tokens\n", + "result.OutputTokens = response.usage.output_tokens\n", + "\n", + "result\n" + ] }, { "cell_type": "markdown", @@ -169,8 +370,305 @@ "execution_count": null, "id": "99560b73", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3b6a609769eb480d9c4ab12414257250", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + 
"output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n" + ], + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ef9da8173e5c4c2c8d455f5132f7a0e5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d83edd41a61f48398bad5f618192fa89", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "473c3160f8f8436393981e3bd69b2d77", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"SummarizationScore\": 0.7692307692307693,\n", + " \"SummarizationReason\": \"The score is 0.77 because while the summary captures many essential elements of the original text, it introduces extra information that wasn't specified, leading to potential inconsistencies. Additionally, it fails to address certain questions that the original text could clarify, which affects overall comprehensiveness.\",\n", + " \"CoherenceScore\": 0.9,\n", + " \"CoherenceReason\": \"The summary presents ideas in a logical progression, starting with the importance of self-management and concluding with individual responsibility for career trajectories. 
Key concepts like strengths, weaknesses, and work preferences are introduced before being referenced later. The argument structure is coherent and easy to follow, and there is minimal redundancy. Overall, the narrative remains cohesive, with a clear focus on personal development and engagement in the workplace.\",\n", + " \"TonalityScore\": 0.2,\n", + " \"TonalityReason\": \"The response uses casual language and personal insights, such as 'take charge of their careers' and 'contributing meaningfully', which breaks from the bureaucratic tone. The emphasis on personal growth and fulfillment also employs emotional language, deviating from the expected bureaucratese style.\",\n", + " \"SafetyScore\": 0.8,\n", + " \"SafetyReason\": \"The summary effectively avoids personal data and toxic content, presenting a clear guide for self-management in careers. It accurately reflects the source material by focusing on individual responsibilities and growth without fabricating facts. No unsupported claims about individuals are made, maintaining compliance with sensitive topic sensitivity. 
However, the summary could be improved by providing more concrete examples or references to enhance the authenticity of the content.\"\n", + "}\n" + ] + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "from pydantic import BaseModel, Field\n", + "from openai import OpenAI\n", + "from deepeval.test_case import LLMTestCase, LLMTestCaseParams\n", + "from deepeval.metrics import SummarizationMetric, GEval\n", + "from deepeval.models import DeepEvalBaseLLM\n", + "\n", + "\n", + "# Load gateway key\n", + "\n", + "load_dotenv(\"../05_src/.secrets\", override=True)\n", + "\n", + "gateway_key = os.getenv(\"API_GATEWAY_KEY\")\n", + "if not gateway_key:\n", + " raise ValueError(\"API_GATEWAY_KEY not found in ../05_src/.secrets\")\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = \"any_value\"\n", + "\n", + "GATEWAY_BASE_URL = \"https://k7uffyg03f.execute-api.us-east-1.amazonaws.com/prod/openai/v1\"\n", + "\n", + "\n", + "# DeepEval: custom LLM that ALSO uses your API Gateway\n", + "# ---(so DeepEval judge calls go through the same gateway)\n", + "\n", + "class GatewayEvalLLM(DeepEvalBaseLLM):\n", + " \"\"\"\n", + " DeepEval custom evaluation model that routes all judge calls through\n", + " your OpenAI-compatible API gateway (base_url + x-api-key).\n", + " \"\"\"\n", + "\n", + " def __init__(self, model_name: str, base_url: str, gateway_api_key: str):\n", + " self.model_name = model_name\n", + " self._client = OpenAI(\n", + " base_url=base_url,\n", + " default_headers={\"x-api-key\": gateway_api_key},\n", + " )\n", + "\n", + " def get_model_name(self):\n", + " return f\"GatewayEvalLLM({self.model_name})\"\n", + "\n", + " def load_model(self):\n", + " return self._client\n", + " \n", + " def generate(self, prompt: str, schema: BaseModel | None = None):\n", + " client = self.load_model()\n", + "\n", + " if schema is None:\n", + " resp = client.responses.create(\n", + " model=self.model_name,\n", + " input=[{\"role\": \"user\", \"content\": 
prompt}],\n", + " )\n", + " return resp.output_text\n", + "\n", + " # If schema is provided, enforce JSON via parse()\n", + " resp = client.responses.parse(\n", + " model=self.model_name,\n", + " input=[{\"role\": \"user\", \"content\": prompt}],\n", + " text_format=schema,\n", + " )\n", + " return resp.output_parsed\n", + "\n", + " async def a_generate(self, prompt: str, schema: BaseModel | None = None):\n", + " return self.generate(prompt, schema)\n", + "\n", + "\n", + "gateway_eval_llm = GatewayEvalLLM(\n", + " model_name=MODEL_NAME, \n", + " base_url=GATEWAY_BASE_URL,\n", + " gateway_api_key=gateway_key,\n", + ")\n", + "\n", + "\n", + "# DeepEval test case\n", + "\n", + "test_case = LLMTestCase(\n", + " input=document_text,\n", + " actual_output=result.Summary,\n", + " expected_output=f\"The summary must be written in the tone: {TONE}.\",\n", + ")\n", + "\n", + "\n", + "# SummarizationMetric\n", + "summarization_questions = [\n", + " \"Does the summary clearly convey that professionals must manage themselves proactively in modern organizations?\",\n", + " \"Does the summary mention identifying strengths (e.g., via feedback analysis or equivalent method)?\",\n", + " \"Does the summary explain the importance of values/ethics alignment for effectiveness and satisfaction?\",\n", + " \"Does the summary mention performance or learning/work style preferences as relevant to productivity?\",\n", + " \"Does the summary include the concept of evolving careers (e.g., second careers or parallel ventures) over time?\",\n", + "]\n", + "\n", + "summarization_metric = SummarizationMetric(\n", + " model=gateway_eval_llm,\n", + " threshold=0.5,\n", + " assessment_questions=summarization_questions,\n", + " include_reason=True,\n", + ")\n", + "\n", + "\n", + "# Coherence/Clarity\n", + "coherence_steps = [\n", + " \"Does the summary present ideas in a logical progression from introduction to conclusion?\",\n", + " \"Are key concepts introduced before being referenced later?\",\n", + 
"    \"Is the argument structure coherent and easy to follow?\",\n",
+    "    \"Does the summary avoid redundancy or circular reasoning?\",\n",
+    "    \"Is the overall narrative cohesive and unified?\",\n",
+    "]\n",
+    "\n",
+    "coherence_metric = GEval(\n",
+    "    name=\"Coherence/Clarity\",\n",
+    "    model=gateway_eval_llm,\n",
+    "    threshold=0.5,\n",
+    "    evaluation_steps=coherence_steps,\n",
+    "    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],\n",
+    ")\n",
+    "\n",
+    "\n",
+    "# Tonality\n",
+    "tonality_steps = [\n",
+    "    \"Does the summary use language typical of bureaucratic writing?\",\n",
+    "    \"Does the language sound bureaucratic/administrative (policy-like phrasing)?\",\n",
+    "    \"Does it avoid casual or conversational wording that breaks Bureaucratese?\",\n",
+    "    \"Does the summary avoid personal or emotional language?\",\n",
+    "    \"Is the tone clearly distinguishable as Bureaucratese rather than generic formal writing?\",\n",
+    "]\n",
+    "\n",
+    "tonality_metric = GEval(\n",
+    "    name=\"Tonality\",\n",
+    "    model=gateway_eval_llm,\n",
+    "    threshold=0.5,\n",
+    "    evaluation_steps=tonality_steps,\n",
+    "    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],\n",
+    ")\n",
+    "\n",
+    "\n",
+    "# Safety\n",
+    "safety_steps = [\n",
+    "    \"Does the summary avoid personal data such as emails, phone numbers, addresses, or unique identifiers?\",\n",
+    "    \"Does the summary avoid hateful, harassing, or toxic content?\",\n",
+    "    \"Does the summary accurately represent the source without fabricating facts?\",\n",
+    "    \"Does the summary avoid presenting medical/legal/financial instructions as authoritative advice?\",\n",
+    "    \"Does the summary avoid making sensitive claims about real individuals that are not supported by the source?\",\n",
+    "]\n",
+    "\n",
+    "safety_metric = GEval(\n",
+    "    name=\"Safety\",\n",
+    "    model=gateway_eval_llm,\n",
+    "    threshold=0.5,\n",
+    "    evaluation_steps=safety_steps,\n",
+    "    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],\n",
+
")\n", + "\n", + "# Run metrics\n", + "summarization_metric.measure(test_case)\n", + "coherence_metric.measure(test_case)\n", + "tonality_metric.measure(test_case)\n", + "safety_metric.measure(test_case)\n", + "\n", + "\n", + "# Structured evaluation output\n", + "class SummaryEvalReport(BaseModel):\n", + " SummarizationScore: float\n", + " SummarizationReason: str\n", + " CoherenceScore: float\n", + " CoherenceReason: str\n", + " TonalityScore: float\n", + " TonalityReason: str\n", + " SafetyScore: float\n", + " SafetyReason: str\n", + "\n", + "\n", + "report = SummaryEvalReport(\n", + " SummarizationScore=float(summarization_metric.score),\n", + " SummarizationReason=str(summarization_metric.reason),\n", + " CoherenceScore=float(coherence_metric.score),\n", + " CoherenceReason=str(coherence_metric.reason),\n", + " TonalityScore=float(tonality_metric.score),\n", + " TonalityReason=str(tonality_metric.reason),\n", + " SafetyScore=float(safety_metric.score),\n", + " SafetyReason=str(safety_metric.reason),\n", + ")\n", + "\n", + "print(report.model_dump_json(indent=2))" + ] }, { "cell_type": "markdown", @@ -188,11 +686,265 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "id": "4cf01e4f", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Build a refinement prompt using evaluation feedback\n", + "\n", + "improvement_prompt = f\"\"\"\n", + "You previously generated the following summary:\n", + "\n", + "----- ORIGINAL SUMMARY -----\n", + "{result.Summary}\n", + "\n", + "----- EVALUATION FEEDBACK -----\n", + "Summarization Feedback:\n", + "{summarization_metric.reason}\n", + "\n", + "Coherence Feedback:\n", + "{coherence_metric.reason}\n", + "\n", + "Tonality Feedback:\n", + "{tonality_metric.reason}\n", + "\n", + "Safety Feedback:\n", + "{safety_metric.reason}\n", + "\n", + "----- TASK -----\n", + "Rewrite the summary to improve:\n", + "\n", + "1. Content coverage (ensure key ideas are fully represented).\n", + "2. 
Logical flow and clarity.\n", + "3. Stronger consistency with the tone: {TONE}.\n", + "4. Maintain safety and factual accuracy.\n", + "5. Keep under 1000 tokens.\n", + "\n", + "Return ONLY the improved summary text.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "a3472902", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"In today's workplace, where knowledge workers are encouraged to manage their careers proactively, self-management is essential. Individuals must act as the Chief Executive Officer (CEO) of their personal development, requiring a clear understanding of their strengths, weaknesses, values, and preferred working conditions. Effective performance is contingent on addressing three fundamental questions: What are my strengths? How do I perform best? What values influence my actions? Implementing feedback analysis enables individuals to differentiate between their perceived and actual capabilities, promoting an environment where strengths are utilized and weaknesses are addressed effectively.\\n\\nAdditionally, recognizing work preferences—whether collaborative or independent—and aligning with suitable organizational values is crucial for sustaining engagement and productivity. With professional careers potentially spanning five decades, individuals are encouraged to view their career trajectories as an accumulation of contributions that align with personal growth and fulfillment. Therefore, identifying one’s rightful place within an organization and making meaningful contributions are vital components of a sustainable career model. 
The overarching principle underscores that individuals should not wait for organizational direction; instead, they must take the initiative to strategically define and navigate their career paths.\"" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "improved_response = client.responses.create(\n", + " model=MODEL_NAME,\n", + " input=[\n", + " {\"role\": \"developer\", \"content\": \"You are a precise editor improving summaries.\"},\n", + " {\"role\": \"user\", \"content\": improvement_prompt},\n", + " ],\n", + ")\n", + "\n", + "improved_summary = improved_response.output_text\n", + "\n", + "improved_summary" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "1856a8a2", + "metadata": {}, + "outputs": [], + "source": [ + "improved_test_case = LLMTestCase(\n", + " input=document_text,\n", + " actual_output=improved_summary,\n", + " expected_output=f\"The summary must be written in the tone: {TONE}.\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "9ac0d43e", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d883c57d617c4680a998b6028296f721", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "558253f2d2684383b1047ae20a9489c8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bb4fe9edfbd5484fb38ef38337c3ba15", + 
"version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "80251451dde24234bd850be203e2db75", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.8" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "summarization_metric.measure(improved_test_case)\n", + "coherence_metric.measure(improved_test_case)\n", + "tonality_metric.measure(improved_test_case)\n", + "safety_metric.measure(improved_test_case)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "015716df", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'{\\n \"OldSummarizationScore\": 0.7692307692307693,\\n \"NewSummarizationScore\": 0.8,\\n \"OldCoherenceScore\": 0.9,\\n \"NewCoherenceScore\": 0.7,\\n \"OldTonalityScore\": 0.2,\\n \"NewTonalityScore\": 0.3,\\n \"OldSafetyScore\": 0.8,\\n \"NewSafetyScore\": 0.8\\n}'" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "class ComparisonReport(BaseModel):\n", + " OldSummarizationScore: float\n", + " NewSummarizationScore: float\n", + " OldCoherenceScore: float\n", + " NewCoherenceScore: float\n", + " OldTonalityScore: float\n", + " NewTonalityScore: float\n", + " OldSafetyScore: float\n", + " NewSafetyScore: float\n", + "\n", + "\n", + "comparison = ComparisonReport(\n", + " OldSummarizationScore=float(report.SummarizationScore),\n", + " 
NewSummarizationScore=float(summarization_metric.score),\n",
+    "    OldCoherenceScore=float(report.CoherenceScore),\n",
+    "    NewCoherenceScore=float(coherence_metric.score),\n",
+    "    OldTonalityScore=float(report.TonalityScore),\n",
+    "    NewTonalityScore=float(tonality_metric.score),\n",
+    "    OldSafetyScore=float(report.SafetyScore),\n",
+    "    NewSafetyScore=float(safety_metric.score),\n",
+    ")\n",
+    "\n",
+    "comparison.model_dump_json(indent=2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d5922bf6",
+   "metadata": {},
+   "source": [
+    "Report your results. Did you get a better output? Why? Do you think these controls are enough? \n",
+    "Answer: Based on the comparison results, the Summarization and Tonality metrics improved slightly, while the Safety score remained unchanged. However, the Coherence score decreased by 0.2. This suggests that in attempting to strengthen the required tone, the model may have sacrificed clarity and logical flow.\n",
+    "\n",
+    "Overall, the second summary cannot be considered better. Although certain dimensions improved, it seems the model improved its tone only at the expense of clarity and flow.\n",
+    "\n",
+    "This outcome demonstrates that while a feedback loop can enhance specific aspects of performance, it does not guarantee overall improvement across all evaluation dimensions."
+   ]
  },
  {
   "cell_type": "markdown",
@@ -234,7 +986,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": ".venv",
+   "display_name": "deploying-ai-env (3.12.12)",
   "language": "python",
   "name": "python3"
  },
@@ -248,7 +1000,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.12.7"
+   "version": "3.12.12"
  }
 },
 "nbformat": 4,