diff --git a/Agentic-langGraph-RAG/Agentic_PDF_RAG.ipynb b/Agentic-langGraph-RAG/Agentic_PDF_RAG.ipynb new file mode 100644 index 0000000..83108de --- /dev/null +++ b/Agentic-langGraph-RAG/Agentic_PDF_RAG.ipynb @@ -0,0 +1,1240 @@ +{ + "cells": [ + { + "cell_type": "code", + "source": [ + "!pip install pypdfium2 backoff langchain-community langchain langchain-openai langgraph -q\n", + "!pip install qdrant-client -q\n" + ], + "metadata": { + "id": "1I9o4hl4aytx" + }, + "id": "1I9o4hl4aytx", + "execution_count": 1, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bea12ac1", + "metadata": { + "id": "bea12ac1" + }, + "outputs": [], + "source": [ + "from io import BytesIO\n", + "import pypdfium2 as pdfium\n", + "import backoff\n", + "import asyncio\n", + "import json\n", + "import os\n", + "import base64\n", + "from PIL import Image\n", + "import operator\n", + "\n", + "from typing import Annotated, Sequence, TypedDict, Literal\n", + "\n", + "from openai import OpenAIError\n", + "from openai import AsyncOpenAI, OpenAI\n", + "\n", + "from langchain_openai import ChatOpenAI\n", + "from langchain import hub\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.prompts import PromptTemplate\n", + "from langchain_community.vectorstores import Chroma\n", + "from langchain_openai import OpenAIEmbeddings\n", + "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", + "from langchain_core.messages import AnyMessage, BaseMessage, HumanMessage, SystemMessage\n", + "from langchain.tools.retriever import create_retriever_tool\n", + "from langchain_community.vectorstores import FAISS\n", + "\n", + "from langgraph.graph.message import add_messages\n", + "from langgraph.graph import START, END, StateGraph, MessagesState\n", + "from langgraph.prebuilt import tools_condition, ToolNode\n", + "\n", + "\n", + "from IPython.display import Image, display\n", + "from pydantic import BaseModel, Field" + ] + }, + { + "cell_type": "code", + "source": [ + "# if you are working in colab\n", + "from google.colab import userdata\n", + "OPENAI_API_KEY= userdata.get('OPENAI_API_KEY')" + ], + "metadata": { + "id": "x0Vp1uqWbVCx" + }, + "id": "x0Vp1uqWbVCx", + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "26b6fe1d", + "metadata": { + "id": "26b6fe1d" + }, + "outputs": [], + "source": [ + "MODEL = \"gpt-4o-2024-08-06\"\n", + "baseurl = \"https://api.openai.com/v1\"\n", + "apikey = OPENAI_API_KEY\n", + "\n", + "clienta = AsyncOpenAI(api_key=apikey , base_url=baseurl)\n", + "os.environ[\"OPENAI_API_BASE\"] = baseurl\n", + "os.environ[\"OPENAI_API_KEY\"] = apikey" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "984e550a", + "metadata": { + "id": "984e550a" + }, + "outputs": [], + "source": [ + "@backoff.on_exception(backoff.expo, OpenAIError)\n", + "async def parse_page_with_gpt(base64_image: str) -> str:\n", + " messages=[\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": \"\"\"\n", + "\n", + " You are a helpful assistant that extracts information from images.\n", + "\n", + " \"\"\"\n", + " },\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"type\": \"text\", \"text\": \"Extract information from image into text\"},\n", + " {\n", + " \"type\": \"image_url\",\n", + " \"image_url\": {\n", + " \"url\": f\"data:image/jpeg;base64,{base64_image}\",\n", + " \"detail\": \"auto\"\n", + " },\n", + " },\n", + " ],\n", + " }\n", + " ]\n", + " 
response = await clienta.chat.completions.create(\n", + " model=MODEL,\n", + " messages=messages,\n", + " temperature=0,\n", + " max_tokens=4096,\n", + " )\n", + " return response.choices[0].message.content or \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f156ab10", + "metadata": { + "id": "f156ab10" + }, + "outputs": [], + "source": [ + "async def document_analysis(filename: str) -> str:\n", + " \"\"\"\n", + " Document Understanding\n", + "\n", + " Args:\n", + " filename: pdf filename str\n", + " \"\"\"\n", + "\n", + " pdf = pdfium.PdfDocument(filename)\n", + " images = []\n", + " for i in range(len(pdf)):\n", + " page = pdf[i]\n", + " image = page.render(scale=8).to_pil()\n", + " buffered = BytesIO()\n", + " image.save(buffered, format=\"JPEG\")\n", + " img_byte = buffered.getvalue()\n", + " img_base64 = base64.b64encode(img_byte).decode(\"utf-8\")\n", + " images.append(img_base64)\n", + "\n", + " text_of_pages = await asyncio.gather(*[parse_page_with_gpt(image) for image in images])\n", + "\n", + " results = []\n", + "\n", + " extracted_texts = [doc for doc in text_of_pages]\n", + " # Clean each string in the list and append to json_results\n", + " for text in extracted_texts:\n", + " results.append(text)\n", + "\n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8dc79679", + "metadata": { + "id": "8dc79679" + }, + "outputs": [], + "source": [ + "docs_list = await document_analysis(\"/content/stock price LSTM- GNN.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e84dc143", + "metadata": { + "id": "e84dc143" + }, + "outputs": [], + "source": [ + "docs_list" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7a099085", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7a099085", + "outputId": "9302ad64-9e57-40a0-8623-89291f2aa07b" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "data has been written to 1a982e27-8794-46c1-bf1e-ad89ec5d3f05.txt\n" + ] + } + ], + "source": [ + "import uuid\n", + "\n", + "output_file_path = f\"{uuid.uuid4()}.txt\"\n", + "\n", + "with open(output_file_path, 'w') as json_file:\n", + " json.dump(docs_list, json_file, indent=2)\n", + "\n", + "print(f\"data has been written to {output_file_path}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "93c487e4", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "93c487e4", + "outputId": "601c196c-1a1b-4daa-aff9-48c5e7d61970" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[Document(metadata={'source': '1a982e27-8794-46c1-bf1e-ad89ec5d3f05.txt'}, page_content='[\\n \"**Title:** Stock Price Prediction Using a Hybrid LSTM-GNN Model: Integrating Time-Series and Graph-Based Analysis\\\\n\\\\n**Authors:**\\\\n- Meet Satishbhai Sonani, Department of Computer Science, University of Reading, UK\\\\n- Atta Badii, Department of Computer Science, University of Reading, UK\\\\n- Armin Moin, Department of Computer Science, University of Colorado, Colorado Springs, CO, USA\\\\n\\\\n**Abstract:**\\\\nThis paper presents a novel hybrid model that integrates long-short-term memory (LSTM) networks and Graph Neural Networks (GNNs) to enhance stock market prediction accuracy. The LSTM component captures temporal patterns in stock price data, while the GNN component uses Pearson correlation and association analysis to model inter-stock relational data. 
The model is trained using an expanding window validation approach. Experiments show that the hybrid model achieves a mean square error (MSE) of 0.00144, a 10.6% improvement over the standalone LSTM model. The hybrid model outperforms traditional benchmarks like linear regression and convolutional neural networks (CNN). This highlights the potential of combining temporal and relational data for real-time trading and financial analysis.\\\\n\\\\n**Keywords:** Stock Market Prediction, LSTM, GNN, Hybrid Models, Time-Series Analysis, Financial Forecasting, Machine Learning\\\\n\\\\n**1. Introduction:**\\\\nThe stock market is a complex system influenced by various factors, making stock price movements unpredictable. Accurate predictions are crucial for investors and policymakers. Traditional statistical methods often fail to capture complex patterns. Recent interest in machine learning techniques, such as LSTM networks, shows promise in improving prediction accuracy by handling time-series data effectively.\",\\n \"**Extracted Information:**\\\\n\\\\n**Problem and Context:**\\\\n- LSTMs have been applied to stock market prediction due to their ability to model the sequential nature of stock prices over time.\\\\n- Stock markets are influenced by historical prices and relationships between different stocks and sectors.\\\\n- Economic downturns can cause correlated declines in stocks within the same sector.\\\\n\\\\n**Graph Neural Networks (GNNs):**\\\\n- GNNs model relational data as graphs, capturing dependencies and interactions between entities.\\\\n- They can learn complex relationships not evident from isolated data points.\\\\n- GNNs represent stocks as nodes and their relationships as edges, capturing both direct and indirect influences.\\\\n\\\\n**Hybrid Model Proposal:**\\\\n- Combines LSTM networks and GNNs to leverage both temporal and relational data for stock market prediction.\\\\n- Aims to improve predictive accuracy in volatile and complex financial markets.\\\\n\\\\n**Literature Review:**\\\\n\\\\n**2.1 Introduction:**\\\\n- Accurate stock prediction is challenging due to market volatility, non-linearity, and sensitivity to various factors.\\\\n- Traditional statistical methods often fail to capture these complexities.\\\\n- Machine learning models like SVM and ANN struggled with the sequential nature of financial data.\\\\n- CNNs excel in capturing spatial patterns but are less effective with temporal dependencies.\\\\n- LSTMs and GNNs have emerged to address these limitations.\\\\n\\\\n**Explainable AI (XAI):**\\\\n- XAI enhances transparency and interpretability in financial forecasting.\\\\n- It is crucial for trust and compliance in financial markets.\\\\n\\\\n**2.2 Data Acquisition and Challenges:**\\\\n- Reliable prediction models require accurate data acquisition.\\\\n- Financial data is often noisy, incomplete, and unpredictable.\\\\n- Robust preprocessing techniques are essential for enhancing data quality.\\\\n- Scaling and normalization improve model accuracy.\\\\n- Managing noise in financial time series enhances model resilience.\\\\n\\\\n**2.3 Machine Learning Approaches in Financial Forecasting:**\\\\n- LSTMs are prominent in financial time-series prediction due to their ability to capture long-term dependencies.\\\\n- They are suitable for modeling the non-linear and volatile nature of stock prices.\",\\n \"**Text Extracted from Image:**\\\\n\\\\ndemonstrating adaptability in recognising complex patterns and robustness in volatile markets (Yeung et al. 
2020, Zhao et al. 2023). Comparative studies have shown that LSTM outperforms traditional models such as ARIMA in stock price forecasting, particularly with non-linear time-series data (Shankaran et al. 2022, Jarrah & Derbali 2023).\\\\n\\\\nGNNs are essential for analysing stock relationships by capturing dependencies between stocks, modelling the interconnections that traditional methods overlook. (Shi et al. 2024) developed a graph-based GCN-LSTM model integrating relational data with time-series analysis, achieving more accurate predictions by leveraging GNN to capture stock interconnections. Other studies have emphasised the importance of capturing relational dependencies using Graph Convolutional Networks (GCN), showing that GCN outperforms traditional time-series models by considering both temporal and relational dynamics (Singh et al. 2021, Chen et al. 2018). These findings illustrate that GNNs improve stock market predictions by capturing complex relationships between stocks, and when combined with models such as LSTM, they effectively handle both relational and temporal dynamics.\\\\n\\\\nHybrid models integrating LSTM networks with GNNs leverage the strengths of both methods, enabling simultaneous modelling of temporal sequences and relational data. Cheng et al. (2022) demonstrated significant enhancements in predictive accuracy by combining relational data from GNN and temporal patterns from other models. Shi et al. (2024) found that such hybrid models achieve more accurate predictions by capturing both temporal dynamics and inter-stock relationships. However, these models face challenges such as increased computational demands and risks of overfitting or data leakage if not properly implemented (Tang et al. 2021). Careful implementation and validation are necessary to avoid these pitfalls (Mehta & Sen 2020).\\\\n\\\\nDynamic modelling approaches such as rolling window and expanding window analyses are essential for adapting to evolving patterns in stock market data. Rolling window analysis trains the model on a fixed window of recent data, shifting forward with each new prediction, which is effective for short-term predictions (Matsunaga et al. 2019). Expanding window analysis enlarges the training dataset by adding new data while retaining all past observations, outperforming rolling windows in capturing long-term volatility patterns (Feng et al. 2024). These methods enhance the adaptability of predictive models but can incur biases if not properly managed, necessitating a balanced approach to optimise continuous learning while maintaining historical integrity.\\\\n\\\\n4. Research Challenges\\\\n\\\\nDespite the advancements in stock market prediction, several outstanding research issues persist:\\\\n\\\\n1. Integration of Temporal and Relational Models: While hybrid models combining LSTM and GNN have shown potential, empirical evaluations in real-world conditions remain limited. There is a need for more extensive studies assessing their performance in volatile, real-time trading environments(Chen et al. 2018, Shi et al. 2024).\\\\n\\\\n2. Robust Data Handling: Many models overlook real-time challenges such as noisy data, missing values, and market shifts. Although preprocessing methods have been proposed (Bhanja & Das 2018, Yeung et al. 2020), these need further refinement to better handle the complexities of financial data and enhance model resilience.\\\\n\\\\n3. Explainability: The lack of transparency in LSTM and GNN models poses challenges in interpreting predictions. 
While Explainable AI (XAI) offers solutions (Kuiper et al. 2022), its application in financial models is still limited. Integrating XAI techniques could improve trust and compliance in AI-driven financial forecasting.\\\\n\\\\n4. Scalability and Efficiency: Hybrid models are often computationally intensive, making real-time application difficult. Future research should focus on optimising these models for better scalability without sacrificing accuracy, possibly through algorithmic innovations or hardware acceleration.\\\\n\\\\n5. Real-Time Adaptation: Although expanding window analysis improves real-time predictions (Feng et al. 2024), models require better strategies for continuous adaptation to new data in fast-changing markets. This includes developing methods to quickly retrain models or update predictions without extensive computational overheads.\\\\n\\\\n3 Methodology\\\\n\\\\n3.1 Data Collection and Preprocessing\\\\n\\\\nThe dataset was obtained from Kaggle via the YFinance API, providing comprehensive historical stock data essential for time-series analysis. Ten prominent stocks representing diverse sectors were selected: Apple Inc. (AAPL), Microsoft Corporation (MSFT), Comcast Corporation (CMCSA), Costco Wholesale Corporation (COST), Qualcomm\",\\n \"**Text Extracted from Image:**\\\\n\\\\nIncorporated (QCOM), Adobe Inc. (ADBE), Starbucks Corporation (SBUX), Intuit Inc. (INTU), Advanced Micro Devices (AMD), and Intel Corporation (INTC). These stocks were chosen due to their significant market capitalisation and influence, ensuring broad applicability of the findings.\\\\n\\\\nThe data spanned from January 1, 2005, to December 31, 2023, encompassing various market conditions and providing a robust dataset for model training and evaluation. Features extracted included daily open, high, low, close, adjusted close prices, and trading volume, offering a detailed view of market activity. Figure 1 shows an example of normalised closing prices for a sample stock, along with its 50-day and 200-day moving averages.\\\\n\\\\nFigure 1: Normalised closing prices of a sample stock with its 50-day and 200-day moving averages.\\\\n\\\\n3.1.1 Feature Engineering\\\\n\\\\nTo enhance predictive capabilities, feature engineering was conducted separately for the LSTM and GNN components. For the LSTM network, the primary input consisted of sequences of closing prices. Structuring the data into sequences enabled the model to capture temporal dependencies effectively. The batch size, a crucial hyperparameter, was optimised during training, with values tested at 11 and 21 to balance computational efficiency and the ability to learn long-term dependencies.\\\\n\\\\nFor the GNN component, a graph representing relationships between stocks was constructed. Each node represented a stock, and edges represented relationships based on Pearson correlation coefficients and association analysis. Pearson correlation captured linear relationships by calculating the correlation coefficients between the daily returns of stock pairs, while association analysis identified non-linear and complex relationships. Combining these methods provided a comprehensive representation of inter-stock relationships, crucial for modelling the interconnected nature of the stock market.\\\\n\\\\n3.1.2 Data Preprocessing\\\\n\\\\nData preprocessing focused on scaling and structuring the data. The raw data was clean, with no missing values or significant outliers. Min-Max Scaling was applied to normalise the data. 
Specifically, each stock price \\\\\\\\( x \\\\\\\\) was transformed to a normalised value \\\\\\\\( x\\' \\\\\\\\) in the range [0, 1] using\\\\n\\\\n\\\\\\\\[ x\\' = \\\\\\\\frac{x - x_{\\\\\\\\text{min}}}{x_{\\\\\\\\text{max}} - x_{\\\\\\\\text{min}}}, \\\\\\\\]\\\\n\\\\nas shown in Equation 1. This normalisation was essential for the convergence of the neural network models and prevented stocks with higher absolute prices from disproportionately influencing the learning process.\\\\n\\\\nTime-series batches were then created for the LSTM network, each of comprising consecutive daily closing prices. The batch size determined the number of days of historical data instances to be ingested into the model to enable it to make a prediction, impacting its ability to capture dependencies. Outlier detection and removal were omitted to preserve time-series integrity and reflect market realities, as extreme price movements may represent significant events rather than anomalies.\",\\n \"**3.2 Graphical Representation of the Stock Network**\\\\n\\\\n- Stocks were modeled using a graph \\\\\\\\( G = (V, E) \\\\\\\\) where each vertex \\\\\\\\( v \\\\\\\\in V \\\\\\\\) represents a stock, and each edge \\\\\\\\( e \\\\\\\\in E \\\\\\\\) represents a significant relationship between two stocks.\\\\n- Pearson correlation coefficients were calculated between daily returns of stock pairs to quantify linear relationships.\\\\n- Daily return \\\\\\\\( r_t \\\\\\\\) for a stock at time \\\\\\\\( t \\\\\\\\) was computed as:\\\\n\\\\n \\\\\\\\[\\\\n r_t = \\\\\\\\frac{P_t - P_{t-1}}{P_{t-1}}\\\\n \\\\\\\\]\\\\n\\\\n where \\\\\\\\( P_t \\\\\\\\) is the closing price at time \\\\\\\\( t \\\\\\\\).\\\\n\\\\n- Pearson correlation coefficient \\\\\\\\( \\\\\\\\rho_{ij} \\\\\\\\) between stocks \\\\\\\\( i \\\\\\\\) and \\\\\\\\( j \\\\\\\\) was computed as:\\\\n\\\\n \\\\\\\\[\\\\n \\\\\\\\rho_{ij} = \\\\\\\\frac{\\\\\\\\sum_{t=1}^{n} (r_{i,t} - \\\\\\\\bar{r_i})(r_{j,t} - \\\\\\\\bar{r_j})}{\\\\\\\\sqrt{\\\\\\\\sum_{t=1}^{n} (r_{i,t} - \\\\\\\\bar{r_i})^2} \\\\\\\\sqrt{\\\\\\\\sum_{t=1}^{n} (r_{j,t} - \\\\\\\\bar{r_j})^2}}\\\\n \\\\\\\\]\\\\n\\\\n- Edges were established between stocks with \\\\\\\\( |\\\\\\\\rho_{ij}| > 0.7 \\\\\\\\), indicating a strong linear relationship.\\\\n- Association analysis was used to capture non-linear relationships using the Apriori algorithm.\\\\n- A lift threshold of 1.7 was set, meaning two stocks appeared 70% more frequently together than if independent.\\\\n- Only rules exceeding this threshold contributed additional edges in the final undirected graph.\\\\n\\\\n**3.3 LSTM Component**\\\\n\\\\n- The LSTM network captured temporal dependencies in stock prices by processing sequences of historical data.\\\\n- Architecture included an input layer for normalized closing price sequences over optimized time windows.\\\\n- Multiple LSTM layers were stacked to learn complex temporal patterns using gating mechanisms.\\\\n- A dense layer transformed LSTM outputs into feature vectors for integration with the GNN component.\\\\n- Hyperparameters such as learning rate, batch size, and number of epochs were optimized through experimentation.\\\\n- Learning rates of 0.001, 0.005, and 0.01 were tested, with batch sizes of 11 and 21.\\\\n- Number of epochs varied between 10 and 50, using early stopping to prevent overfitting.\\\\n- The Adam optimizer was used for efficient training, and Mean Squared Error (MSE) served as the loss function.\\\\n\\\\n**3.4 GNN Component**\\\\n\\\\n- The GNN 
component modeled relational dependencies among stocks based on the constructed graph.\\\\n- Architecture included an input layer that received the stock graph and initial node features.\",\\n \"3.5 Hybrid Model Integration\\\\n\\\\nThe hybrid model integrated outputs from the LSTM and GNN components to leverage both temporal and relational information. Temporal embeddings from the LSTM captured historical price patterns, while relational embeddings from the GNN encapsulated inter-stock relationships. These embeddings were concatenated to form a unified feature vector, which was then passed through additional dense layers to learn complex interactions. A final dense layer with a linear activation function produced the predicted closing price. Hidden layers utilized ReLU activations to capture non-linear relationships, and the output layer employed a linear activation suitable for regression tasks. The model was trained using the MSE loss function with the Adam optimizer.\\\\n\\\\n3.6 Training Strategy\\\\n\\\\nTo evaluate the performance of the model in a manner that reflects real-world trading scenarios, an expanding window validation strategy was implemented. Baseline models (including linear models, CNNs, dense neural networks, and standalone LSTM models) were first trained to provide a comparative benchmark. The overall training period was set to two years, with 50 days reserved for testing. In this setup, the model was tested on one day at a time; after each test, the data from the corresponding day was added to the training set, effectively expanding the training window. This iterative process, illustrated in Figure 3, ensured that the model consistently incorporated new information from the most recent market conditions, reflecting a dynamic and adaptive learning process. The model was retrained at each step with the updated dataset, enabling it to learn from evolving market trends and continuously improve its predictions. This strategy enhanced model accuracy by ensuring that only historical data was used for training and enhanced the robustness of the model.\\\\n\\\\n3.7 Training Parameters\\\\n\\\\n- Early Stopping: Implemented based on validation loss to prevent overfitting, with a patience parameter set to halt training if no improvement was observed over several epochs.\\\\n- Number of Epochs: Varied between 10 and 50, with the optimal number determined through experimentation.\\\\n- Batch Size: Maintained consistency with the batch sizes used during model input preparation.\\\\n\\\\n4 Experiments and Results\\\\n\\\\n4.1 Experiment Setup\\\\n\\\\nThe experiments were conducted on a high-performance computing platform equipped with an NVIDIA GTX 1080 GPU (8 GB VRAM), 16 GB of RAM, and a multi-core Intel i7 processor. This hardware configuration was essential for...\",\\n \"The document discusses the implementation and evaluation of a hybrid LSTM-GNN model for stock price prediction. 
Key points include:\\\\n\\\\n- **Software Environment**: Utilized Windows OS, Python 3.8, PyTorch, PyTorch Geometric, NumPy, Pandas, scikit-learn, NetworkX, and Matplotlib.\\\\n\\\\n- **Hyperparameter Tuning**: \\\\n - Learning rates tested: 0.001, 0.005, 0.01; 0.005 was optimal.\\\\n - Batch sizes: 11 and 21; 11 was selected.\\\\n - Number of epochs: 10 to 50; optimal between 40 and 50.\\\\n\\\\n- **Model Features**:\\\\n - Patience parameter and minimum delta for early stopping.\\\\n - Adam optimizer for adaptive learning.\\\\n - ReLU activation in hidden layers, linear activation in output layer.\\\\n - Dropout layers with a rate of 0.5.\\\\n\\\\n- **Evaluation Metric**: Mean Squared Error (MSE) defined as:\\\\n\\\\n \\\\\\\\[\\\\n \\\\\\\\text{MSE} = \\\\\\\\frac{1}{n} \\\\\\\\sum_{i=1}^{n} (\\\\\\\\hat{y}_i - y_i)^2\\\\n \\\\\\\\]\\\\n\\\\n where \\\\\\\\( n \\\\\\\\) is the number of predictions, \\\\\\\\( \\\\\\\\hat{y}_i \\\\\\\\) is the predicted stock price, and \\\\\\\\( y_i \\\\\\\\) is the actual stock price.\\\\n\\\\n- **Performance Analysis**:\\\\n - The model effectively integrated time-series and relational data.\\\\n - Consistently low MSE values were observed, with spikes on November 10, 2022, and November 30, 2022, due to market volatility.\\\\n\\\\n- **Figure 4**: Displays MSE values across all test days using the best parameter configuration.\",\\n \"4.3 Impact of Expanding Window Validation\\\\n\\\\nThe expanding window validation strategy was crucial for adapting the model to changing market conditions. By updating the training set with new data, the model maintained dynamic, real-time applicability. This method balanced retaining historical trends and incorporating recent market dynamics, ensuring predictions reflected current market behavior. It enhanced backtesting accuracy and provided a path for real-world deployment, mitigating overfitting and improving generalization.\\\\n\\\\n4.4 Hyperparameter Tuning\\\\n\\\\nHyperparameter tuning used a grid search to find the optimal combination of learning rates, batch sizes, and epochs. The best configuration had a learning rate of 0.005, 40 epochs, and a batch size of 11, achieving the lowest MSE. Early stopping was applied during training to halt the process if no improvement in validation loss was observed for five consecutive epochs, optimizing both training time and model performance.\\\\n\\\\n4.5 Comparison with Baseline Models\\\\n\\\\nThe hybrid LSTM-GNN model was evaluated against several baseline models, including Linear Regression, CNN, DNN, and a standalone LSTM. The hybrid model outperformed all baselines in terms of MSE.\\\\n\\\\n- Hybrid LSTM-GNN: MSE of 0.00144\\\\n- Linear Regression: MSE of 0.00224\\\\n- Standalone LSTM: MSE of 0.00161\\\\n- CNN: MSE of 0.00302\\\\n- DNN: MSE of 0.00335\\\\n\\\\nThe hybrid model consistently achieved lower MSE values across all stocks, demonstrating robustness and generalizability. 
It performed well for stocks like CMCSA, AMD, and INTC, while CNN and DNN models had higher MSE values, especially for more volatile stocks.\\\\n\\\\nFigure 5: Comparison of MSE values across different models.\",\\n \"**Heatmap Information:**\\\\n\\\\n- **Title:** MSE Heatmap for Models and Stocks\\\\n- **Models:** Dense, CNN, Middle, LSTM, LSTM+GNN\\\\n- **Stocks:** AAPL, MSFT, CMCSA, COST, QCOM, ADBE, SBUX, NTU, AMD, INTC\\\\n- **Color Scale:** Ranges from 0.001 to 0.007\\\\n\\\\n**Figure Caption:**\\\\nFigure 6: Heatmap of MSE values for different models across individual stocks.\\\\n\\\\n**Text Extract:**\\\\n\\\\n**4.6 Comparative Study**\\\\n\\\\nThe comparative analysis demonstrates the clear advantages of the hybrid LSTM-GNN model over baseline models in predictive accuracy and robustness. While the standalone LSTM captured temporal dependencies effectively, it lacked the ability to model inter-stock relationships significantly influencing market behaviour. Incorporating the GNN component enabled the hybrid model to utilise relational data, capturing complex interactions between stocks and enhancing predictions.\\\\n\\\\nCompared to the standalone LSTM, the hybrid model achieved a 10.6% reduction in average MSE (0.00144 vs 0.00161). This improvement highlights the value of incorporating relational data through the GNN component, which captures both linear and non-linear relationships between stocks. The added contextual information from the GNN enabled the model to leverage insights from stock correlations and broader market trends, improving overall predictive performance.\\\\n\\\\nThe DNN and CNN models, although capable of modelling non-linear relationships, underperformed due to their inability to capture the sequential nature of stock price movements. Stock prices are inherently temporal, and models that do not account for this structure often fail to identify critical patterns. Consequently, the LSTM-based models outperformed the DNN and CNN, reinforcing the importance of temporal modelling in financial forecasting.\\\\n\\\\nLinear Regression, as a simple baseline model, showed limitations in capturing non-linear relationships and temporal dependencies, resulting in an MSE of 0.00224. Although its MSE was lower than that of the CNN and DNN models, it was less effective than the LSTM-based models due to its inability to model the intricate dynamics of stock markets. These limitations were especially evident in more volatile stocks, where Linear Regression struggled with complex market movements.\\\\n\\\\nA key factor driving the hybrid model superior performance was the expanding window training approach. This method progressively increased the training dataset by incorporating new data as it became available, enabling the model to remain up-to-date with recent market trends. Retraining with the most current data enabled the hybrid LSTM-GNN to continuously adapt to changes in market behaviour.\\\\n\\\\nThe expanding window training approach offers several key advantages. Firstly, it enhances adaptability by enabling the model to learn from recent patterns and market anomalies, which improves predictive accuracy in a dynamic environment.\",\\n \"**5 Discussion**\\\\n\\\\n**5.1 Interpretation of Results**\\\\n\\\\nThe hybrid LSTM-GNN model significantly outperformed baseline models in stock price prediction due to its ability to capture both temporal dynamics and relational dependencies. 
The LSTM component modeled sequential patterns and long-term trends in stock prices, essential in financial time-series data where past events influence future movements. The GNN component captured complex inter-stock relationships by constructing a graph based on Pearson correlation and association analysis, accounting for both linear and non-linear dependencies.\\\\n\\\\nBy integrating temporal and relational embeddings, the hybrid model leveraged the strengths of both approaches, resulting in lower Mean Squared Error (MSE) compared to the standalone LSTM. The expanding window training approach enhanced adaptability by continuously incorporating new data, ensuring the model remained attuned to recent market conditions\\\\u2014a critical factor in the dynamic financial environment. The robust performance of the model across various stocks, including those which are highly volatile such as AMD, indicates its effectiveness in capturing both stable patterns and sudden market shifts.\\\\n\\\\n**5.2 Limitations**\\\\n\\\\nDespite its advantages, the hybrid model has limitations. The increased computational complexity from integrating LSTM and GNN components demands significant processing power and memory, which may not be readily available to all practitioners. The expanding window approach, while improving adaptability, complicates validation since it lacks a separate validation set, increasing the risk of overfitting without proper feedback during training.\\\\n\\\\nThe model performance is sensitive to hyperparameter tuning, requiring extensive experimentation that can be resource-intensive. Data limitations, such as missing values or anomalies, can degrade the model effectiveness. Additionally, assuming that past relationships persist into the future may not hold during unprecedented market events or structural economic changes, reducing the model predictive accuracy.\\\\n\\\\nFrequent retraining due to the expanding window method increases computational load, which can be impractical in real-time applications where swift predictions are essential. This could limit the model applicability in high-frequency trading environments that demand rapid decision-making.\\\\n\\\\n**5.3 Implications for Practice**\\\\n\\\\nThe enhanced predictive accuracy of the hybrid model holds significant implications for real-time trading and financial analysis. It can aid traders and investors in making informed decisions, improving risk management and potentially increasing returns. By accurately forecasting stock prices, the model supports strategies such as algorithmic trading, portfolio optimisation, and risk assessment.\",\\n \"**Conclusion:**\\\\n\\\\nThe study introduces a hybrid model combining Long Short-Term Memory (LSTM) and Graph Neural Network (GNN) for stock price prediction. It integrates temporal and relational data to enhance predictive accuracy. Key findings include:\\\\n\\\\n- The hybrid model outperforms traditional models like linear regression, convolutional neural networks, dense neural networks, and standalone LSTM models.\\\\n- It achieves a notable reduction in Mean Squared Error (MSE), approximately 10.6% lower than standalone LSTM.\\\\n- The GNN component effectively incorporates inter-stock relationships.\\\\n\\\\nThe research demonstrates that integrating temporal dependencies with relational information can improve stock price predictions. 
The model adapts to evolving market conditions, capturing long-term trends and recent market shifts.\\\\n\\\\nFuture research could explore additional data sources like macroeconomic indicators, news sentiment, and social media trends for a comprehensive understanding of stock prices. Extending the model to other financial instruments could validate its adaptability across different markets.\\\\n\\\\n**References:**\\\\n\\\\n1. Bhanja, S. & Das, A. (2018). Impact of data normalization on deep neural network for time series forecasting. arXiv preprint. [URL](https://arxiv.org/abs/1812.05519)\\\\n\\\\n2. Chen, Y., Wei, Z. & Huang, X. (2018). Incorporating Corporation Relationship via Graph Convolutional Neural Networks for Stock Price Prediction.\\\\n\\\\n3. Cheng, D., Yang, F., Xiang, S. & Liu, J. (2022). Financial time series forecasting with multi-modality graph neural network. Pattern Recognition 121, 108218.\\\\n\\\\n4. Fama, E. F. (1970). Efficient capital markets: A review of theory and empirical work. The Journal of Finance 25(2), 383-417. [URL](http://www.jstor.org/stable/2325486)\\\\n\\\\n5. Feng, Y., Zhang, Y. & Wang, Y. (2024). Out-of-sample volatility prediction: Rolling window, expanding window, or both?. Journal of Forecasting 43(3), 567-582.\\\\n\\\\n6. Fischer, T. & Krauss, C. (2018). Deep learning with long short-term memory networks for financial market predictions. European Journal of Operational Research 270(2), 654-669. [URL](https://www.sciencedirect.com/science/article/pii/S0377221717310652)\",\\n \"Here is the extracted information from the image:\\\\n\\\\n1. Fjellstr\\\\u00f6m, C. (2022). \\\\\"Long short-term memory neural network for financial time series\\\\\". URL: https://arxiv.org/abs/2201.08218\\\\n\\\\n2. Gandhmal, D. P. & Kannan, K. (2019). \\\\\"Systematic analysis and review of stock market prediction techniques\\\\\", Computer Science Review 34. URL: https://api.semanticscholar.org/CorpusID:202771464\\\\n\\\\n3. Hadavandi, E., Ghanbari, A. & Abbasian-Naghneh, S. (2010). \\\\\"Developing a time series model based on particle swarm optimization for gold price forecasting\\\\\", 2012 Fifth International Conference on Business Intelligence and Financial Engineering 0, 337\\\\u2013340.\\\\n\\\\n4. Hiremath, G. S. & Kamaiah, B. (2010). \\\\\"Nonlinear dependence in stock returns: Evidences from india\\\\\", Journal of Quantitative Economics 8(1), 69\\\\u201385. URL: https://ideas.repec.org/a/jqe/jqenew/v8y2010i1p69-85.html\\\\n\\\\n5. Hochreiter, S. & Schmidhuber, J. (1997). \\\\\"Long short-term memory\\\\\", Neural Computation 9(8), 1735\\\\u20131780. URL: https://doi.org/10.1162/neco.1997.9.8.1735\\\\n\\\\n6. Jarrah, M. & Derbali, M. (2023). \\\\\"Predicting saudi stock market index by using multivariate time series based on deep learning\\\\\", Applied Sciences 13(14).\\\\n\\\\n7. Kipf, T. & Welling, M. (2016). \\\\\"Semi-supervised classification with graph convolutional networks\\\\\", ArXiv abs/1609.02907. URL: https://api.semanticscholar.org/CorpusID:31444218\\\\n\\\\n8. Kuiper, O., van den Berg, M., van der Burgt, J. & Leijnen, S. (2022). Exploring explainable ai in the financial sector: Perspectives of banks and supervisory authorities, in \\\\\"Explainable Artificial Intelligence: Concepts, Techniques, and Applications\\\\\", pp. 105\\\\u2013119.\\\\n\\\\n9. Kullmann, L., Kert\\\\u00e9sz, J. & Kaski, K. (2002). 
\\\\\"Time-dependent cross-correlations between different stock returns: A directed network of influence\\\\\", Physical Review E 66(2), 026125. URL: https://link.aps.org/doi/10.1103/PhysRevE.66.026125\\\\n\\\\n10. Malkiel, B. G. (2003). \\\\\"The efficient market hypothesis and its critics\\\\\", Journal of Economic Perspectives 17(1), 59\\\\u201382. URL: https://www.aeaweb.org/articles?id=10.1257/089533003321164958\\\\n\\\\n11. Mantegna, R. N. (1999). \\\\\"Hierarchical structure in financial markets\\\\\", The European Physical Journal B 11(1), 193\\\\u2013197. URL: https://doi.org/10.1007/s100510050929\\\\n\\\\n12. Martinez, J. (2003). \\\\\"The impact of firm-specific attributes on the relevance in earnings and cash-flows: a nonlinear relationship between stock returns and accounting numbers\\\\\", Review of Accounting and Finance 2(1), 16\\\\u201339. URL: https://doi.org/10.1108/eb026999\\\\n\\\\n13. Matsunaga, D., Suzumura, T. & Takahashi, T. (2019). \\\\\"Exploring graph neural networks for stock market predictions with rolling window analysis\\\\\", arXiv preprint . URL: https://arxiv.org/abs/1909.10660\\\\n\\\\n14. Mehtab, S. & Sen, J. (2020). Stock price prediction using cnn and lstm-based deep learning models, in \\\\\"2020 International Conference on Decision Aid Sciences and Application (DASA)\\\\\", IEEE, pp. 447\\\\u2013453.\\\\n\\\\n15. Ran, X., Shan, Z., Fan, Y. & Gao, L. (2024). \\\\\"A model based lstm and graph convolutional network for stock trend prediction\\\\\", PeerJ Computer Science 10, e2326.\\\\n\\\\n16. Senapati, M. R., Das, S. & Mishra, S. (2018). \\\\\"A novel model for stock price prediction using hybrid neural network\\\\\", Journal of The Institution of Engineers (India): Series B 99(6), 555\\\\u2013563. URL: https://doi.org/10.1007/s40031-018-0343-7\\\\n\\\\n17. Shankar, P., Sharma, N., Naga Rohith, K. & Ghosh, A. (2022). \\\\\"Stock-price-prediction-using lstm arima\\\\\", Journal of Development Economics and Management Research Studies 9, 55\\\\u201366.\\\\n\\\\n18. Shi, Y., Wang, Y., Qu, Y. & Chen, Z. (2024). \\\\\"Integrated gcn-lstm stock prices movement prediction based on knowledge- incorporated graphs construction\\\\\", International Journal of Machine Learning and Cybernetics 15(1), 161\\\\u2013176.\\\\n\\\\n19. Soni, A., Gupta, P. & Thakur, N. (2021). \\\\\"An empirical research and comprehensive analysis of stock market prediction using machine learning and deep learning techniques\\\\\", IOP Conference Series: Materials Science and Engineering 1022(1), 012098.\\\\n\\\\n20. Tang, Q., Shi, R., Fan, T., Ma, Y. & Huang, J. (2021). \\\\\"Prediction of financial time series based on lstm using wavelet transform and singular spectrum analysis\\\\\", Mathematical Problems in Engineering 2021(1), 9942410.\\\\n\\\\n21. Thakkar, A. & Chaudhari, K. (2021). \\\\\"Fusion in stock market prediction: a decade survey on the necessity, recent developments, and potential future directions\\\\\", Information Fusion 65, 95\\\\u2013107.\",\\n \"Worsnup, C. (2022). \\\\u2018Backtesting cross validation for timeseries [notebook]\\\\u2019. \\\\n*URL:* https://www.kaggle.com/code/cworsnup/backtesting-cross-validation-for-timeseries/notebook\\\\n\\\\nWu, Z., Pan, S., Chen, F., Long, G., Zhang, C. & Yu, P. S. (2021). \\\\u2018A comprehensive survey on graph neural networks\\\\u2019, *IEEE Transactions on Neural Networks and Learning Systems* 32(1), 4\\\\u201324.\\\\n\\\\nYeung, J., Wei, Z., Chan, K. Y., Lau, H. & Yiu, K.-F. (2020). 
\\\\u2018Jump detection in financial time series using machine learning algorithms\\\\u2019, *Soft Computing* 24.\\\\n\\\\nZhao, C., Hu, P., Liu, X., Lan, X. & Zhang, H. (2023). \\\\u2018Stock market analysis using time series relational models for stock price prediction\\\\u2019, *Mathematics* 11(5), 1130.\"\\n]')]" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ], + "source": [ + "from langchain_community.document_loaders import TextLoader\n", + "\n", + "loader = TextLoader(output_file_path)\n", + "data = loader.load()\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b0158e26", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "b0158e26", + "outputId": "7a3f2243-bb8b-4afa-9314-8777539f5061" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "File 1a982e27-8794-46c1-bf1e-ad89ec5d3f05.txt deleted successfully.\n" + ] + } + ], + "source": [ + "# Check if the file exists\n", + "if os.path.exists(output_file_path):\n", + "\n", + " # Delete the file\n", + " os.remove(output_file_path)\n", + " print(f\"File {output_file_path} deleted successfully.\")\n", + "else:\n", + " print(\"File does not exist.\")" + ] + }, + { + "cell_type": "code", + "source": [ + "from langchain_core.documents import Document\n", + "from langchain_text_splitters import CharacterTextSplitter\n", + "\n", + "documents = [Document(page_content=text, metadata={\"page\": i})\n", + " for i, text in enumerate(docs_list)]\n", + "\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=400)\n", + "doc_splits = text_splitter.split_documents(documents)\n", + "len(doc_splits)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nWdMKAndRWbl", + "outputId": "0ad24e49-d38c-44af-8874-eb66e5f5179a" + }, + "id": "nWdMKAndRWbl", + "execution_count": 21, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "WARNING:langchain_text_splitters.base:Created a chunk of size 1077, which is longer than the specified 1000\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "50" + ] + }, + "metadata": {}, + "execution_count": 21 + } + ] + }, + { + "cell_type": "code", + "source": [ + "embedding_model = OpenAIEmbeddings(model=\"text-embedding-3-small\")" + ], + "metadata": { + "id": "7Z3cK37GfrAN" + }, + "id": "7Z3cK37GfrAN", + "execution_count": 14, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from google.colab import userdata\n", + "QDRANT_API_KEY= userdata.get('QDRANT_API_KEY')\n", + "QDRANT_URL=\"\"\n", + "QDRANT_API_KEY=\"\"" + ], + "metadata": { + "id": "HOVvYwobdlHu" + }, + "id": "HOVvYwobdlHu", + "execution_count": 38, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import qdrant_client\n", + "from qdrant_client import QdrantClient\n", + "from qdrant_client.models import Distance, VectorParams\n", + "\n", + "client = QdrantClient(\n", + " QDRANT_URL,\n", + " api_key = QDRANT_API_KEY\n", + ")\n", + "\n", + "QDRANT_COLLECTION = \"agentic_collection\"\n", + "collection_config = qdrant_client.http.models.VectorParams(\n", + " size=1536, # 768 for instructor-xl, 1536 for OpenAI # 384 for sentence trans= fastembed 768 for mpnet\n", + " distance=qdrant_client.http.models.Distance.COSINE\n", + " )\n", + "client.recreate_collection(\n", + " collection_name = QDRANT_COLLECTION,\n", + " vectors_config=collection_config\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/" + }, + "id": "JaY5jXDpQRCO", + "outputId": "74d22154-9ef2-4c4d-d37b-c8a8176980e7" + }, + "id": "JaY5jXDpQRCO", + "execution_count": 39, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/tmp/ipython-input-3990674477.py:17: DeprecationWarning: `recreate_collection` method is deprecated and will be removed in the future. Use `collection_exists` to check collection existence and `create_collection` instead.\n", + " client.recreate_collection(\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 39 + } + ] + }, + { + "cell_type": "code", + "source": [ + "from langchain.vectorstores import Qdrant\n", + "\n", + "vectorstore = Qdrant(\n", + " client=client,\n", + " collection_name=QDRANT_COLLECTION,\n", + " embeddings=embedding_model\n", + " )\n", + "texts = [doc.page_content for doc in doc_splits]\n", + "vectorstore.add_texts(texts)\n", + "retriever=vectorstore.as_retriever()" + ], + "metadata": { + "id": "6uT3zoN8QQyW" + }, + "id": "6uT3zoN8QQyW", + "execution_count": 40, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "5c2a4092", + "metadata": { + "id": "5c2a4092" + }, + "outputs": [], + "source": [ + "retriever_tool = create_retriever_tool(\n", + " retriever,\n", + " \"document_understanding\",\n", + " \"Retrieve and provide insights on document content analysis and knowledge extraction\",\n", + ")\n", + "tools = [retriever_tool]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "5b0ffede", + "metadata": { + "id": "5b0ffede" + }, + "outputs": [], + "source": [ + "class AgentState(TypedDict):\n", + " # The add_messages function defines how an update should be processed\n", + " # Default is to replace. add_messages says \"append\"\n", + " messages: Annotated[Sequence[BaseMessage], add_messages]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "33cd3b20", + "metadata": { + "id": "33cd3b20" + }, + "outputs": [], + "source": [ + "### Edges\n", + "def grade_documents(state) -> Literal[\"generate\", \"rewrite\"]:\n", + " \"\"\"\n", + " Determines whether the retrieved documents are relevant to the question.\n", + "\n", + " Args:\n", + " state (messages): The current state\n", + "\n", + " Returns:\n", + " str: A decision for whether the documents are relevant or not\n", + " \"\"\"\n", + "\n", + " print(\"---CHECK RELEVANCE---\")\n", + "\n", + " # Data model\n", + " class grade(BaseModel):\n", + " \"\"\"Binary score for relevance check.\"\"\"\n", + "\n", + " binary_score: str = Field(description=\"Relevance score 'yes' or 'no'\")\n", + "\n", + " # LLM\n", + " model = ChatOpenAI(temperature=0, model=\"gpt-4o\", streaming=True)\n", + "\n", + " # LLM with tool and validation\n", + " llm_with_tool = model.with_structured_output(grade)\n", + "\n", + "\n", + " # Prompt\n", + " prompt = PromptTemplate(\n", + " template=\"\"\"You are a grader assessing relevance of a retrieved document to a user question. \\n\n", + " Here is the retrieved document: \\n\\n {context} \\n\\n\n", + " Here is the user question: {question} \\n\n", + " If the document contains keyword(s) then grade it as relevant. 
\\n\n", + " Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.\"\"\",\n", + " input_variables=[\"context\", \"question\"],\n", + " )\n", + "\n", + " # Chain\n", + " chain = prompt | llm_with_tool\n", + "\n", + " messages = state[\"messages\"]\n", + " last_message = messages[-1]\n", + "\n", + " question = messages[0].content\n", + " docs = last_message.content\n", + "\n", + " print(\"question: \", question)\n", + " print(\"context: \", docs)\n", + " scored_result = chain.invoke({\"question\": question, \"context\": docs})\n", + "\n", + " score = scored_result.binary_score\n", + "\n", + " if score == \"yes\":\n", + " print(\"---DECISION: DOCS RELEVANT---\")\n", + " return \"generate\"\n", + "\n", + " else:\n", + " print(\"---DECISION: DOCS NOT RELEVANT---\")\n", + " print(score)\n", + " return \"rewrite\"" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "00d5c686", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "00d5c686", + "outputId": "13fa95bb-eaa0-4313-d526-ba2e8fc4b3a1" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "********************Prompt[rlm/rag-prompt]********************\n", + "================================\u001b[1m Human Message \u001b[0m=================================\n", + "\n", + "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\n", + "Question: \u001b[33;1m\u001b[1;3m{question}\u001b[0m \n", + "Context: \u001b[33;1m\u001b[1;3m{context}\u001b[0m \n", + "Answer:\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.11/dist-packages/langsmith/client.py:272: LangSmithMissingAPIKeyWarning: API key must be provided when using hosted LangSmith API\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "### Nodes\n", + "def agent(state):\n", + " \"\"\"\n", + " Invokes the agent model to generate a response based on the current state. Given\n", + " the question, it will decide to retrieve using the retriever tool, or simply end.\n", + "\n", + " Args:\n", + " state (messages): The current state\n", + "\n", + " Returns:\n", + " dict: The updated state with the agent response appended to messages\n", + " \"\"\"\n", + " print(\"---CALL AGENT---\")\n", + " messages = state[\"messages\"]\n", + " model = ChatOpenAI(temperature=0, streaming=True, model=\"gpt-4o\")\n", + " model = model.bind_tools(tools)\n", + " response = model.invoke(messages)\n", + " # We return a list, because this will get added to the existing list\n", + " return {\"messages\": [response]}\n", + "\n", + "\n", + "def rewrite(state):\n", + " \"\"\"\n", + " Transform the query to produce a better question.\n", + "\n", + " Args:\n", + " state (messages): The current state\n", + "\n", + " Returns:\n", + " dict: The updated state with re-phrased question\n", + " \"\"\"\n", + "\n", + " print(\"---TRANSFORM QUERY---\")\n", + " messages = state[\"messages\"]\n", + " question = messages[0].content\n", + "\n", + " msg = [\n", + " HumanMessage(\n", + " content=f\"\"\" \\n\n", + " Look at the input and try to reason about the underlying semantic intent / meaning. 
\\n\n", + " Here is the initial question:\n", + " \\n ------- \\n\n", + " {question}\n", + " \\n ------- \\n\n", + " Formulate an improved question: \"\"\",\n", + " )\n", + " ]\n", + "\n", + " # Grader\n", + " model = ChatOpenAI(temperature=0, model=\"gpt-4o\", streaming=True)\n", + " response = model.invoke(msg)\n", + " return {\"messages\": [response]}\n", + "\n", + "\n", + "def generate(state):\n", + " \"\"\"\n", + " Generate answer\n", + "\n", + " Args:\n", + " state (messages): The current state\n", + "\n", + " Returns:\n", + " dict: The updated state with re-phrased question\n", + " \"\"\"\n", + " print(\"---GENERATE---\")\n", + " messages = state[\"messages\"]\n", + " question = messages[0].content\n", + " last_message = messages[-1]\n", + "\n", + " docs = last_message.content\n", + "\n", + " # Prompt\n", + " prompt = hub.pull(\"rlm/rag-prompt\")\n", + "\n", + " # LLM\n", + " llm = ChatOpenAI(model_name=\"gpt-4o\", temperature=0, streaming=True)\n", + "\n", + " # Post-processing\n", + " def format_docs(docs):\n", + " return \"\\n\\n\".join(doc.page_content for doc in docs)\n", + "\n", + " # Chain\n", + " rag_chain = prompt | llm | StrOutputParser()\n", + "\n", + " # Run\n", + " print(\"context: \", docs)\n", + " print(\"question: \", question)\n", + " response = rag_chain.invoke({\"context\": docs, \"question\": question})\n", + " return {\"messages\": [response]}\n", + "\n", + "\n", + "print(\"*\" * 20 + \"Prompt[rlm/rag-prompt]\" + \"*\" * 20)\n", + "prompt = hub.pull(\"rlm/rag-prompt\").pretty_print() # Show what the prompt looks like\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "bd6a1131", + "metadata": { + "id": "bd6a1131" + }, + "outputs": [], + "source": [ + "# Define a new graph\n", + "workflow = StateGraph(AgentState)\n", + "\n", + "# Define the nodes we will cycle between\n", + "workflow.add_node(\"agent\", agent) # agent\n", + "retrieve = ToolNode([retriever_tool])\n", + "workflow.add_node(\"retrieve\", retrieve) # retrieval\n", + "workflow.add_node(\"rewrite\", rewrite) # Re-writing the question\n", + "workflow.add_node(\n", + " \"generate\", generate\n", + ") # Generating a response after we know the documents are relevant\n", + "# Call agent node to decide to retrieve or not\n", + "workflow.add_edge(START, \"agent\")\n", + "\n", + "# Decide whether to retrieve\n", + "workflow.add_conditional_edges(\n", + " \"agent\",\n", + " # Assess agent decision\n", + " tools_condition,\n", + " {\n", + " # Translate the condition outputs to nodes in our graph\n", + " \"tools\": \"retrieve\",\n", + " END: END,\n", + " },\n", + ")\n", + "\n", + "# Edges taken after the `action` node is called.\n", + "workflow.add_conditional_edges(\n", + " \"retrieve\",\n", + " # Assess agent decision\n", + " grade_documents,\n", + ")\n", + "workflow.add_edge(\"generate\", END)\n", + "workflow.add_edge(\"rewrite\", \"agent\")\n", + "\n", + "# Compile\n", + "graph = workflow.compile()\n" + ] + }, + { + "cell_type": "code", + "source": [ + "from IPython.display import Image, display\n", + "\n", + "try:\n", + " display(Image(graph.get_graph(xray=True).draw_mermaid_png()))\n", + "except Exception:\n", + " # This requires some extra dependencies and is optional\n", + " pass" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 473 + }, + "id": "iP5IM0qhcrQx", + "outputId": "edb939ac-698d-4979-fa57-9e771d93b370" + }, + "id": "iP5IM0qhcrQx", + "execution_count": 30, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAARIAAAHICAIAAAAN8PI9AAAAAXNSR0IArs4c6QAAIABJREFUeJzt3XdAE/f7B/BPdiCBsPeSLYiCUlSqCIKCuEedgKvWgXa4bbW2jm9tHbW2WrVqVVLraLWoaFXcuBFFECd7y4aEDDJ+f8QfUoRANLnLJc/rL7Lu3kl4cvfc+BxJLpcjAIAqyHgHAIB4oGwAUBmUDQAqg7IBQGVQNgCoDMoGAJVR8Q6gfuUFIn69pLFe0iSWiwUyvON0jG5AplBILA6FZUSzdmaQ4KdM65F0Zr/N8zRebiYv9zHfxYcllcpZxlQza7pIIMU7V8cYBpTaCjG/XiIWyYueNzp5G3bxZfkEccg6+JumI3ShbB7frr95utK1G8vJi9XFl0Wlk/BO9F7ynzTmPubnP+H79uEEDjLFOw5oA7HLprai6XxCmYU9I3i4BdNQ11Zubp+pSr9WGxVn6+xjiHcW8B8ELpvsR7xbSVUjZtsbm+ns2kyTSH7xSLmlHaNXBCx2tAhRy6Y4W/Doeu2QabZ4B8HCraQqJosSEGqCdxDwGiHLJvNmXf5TwdAZNngHwc6NU1VioTTsIyu8gwBEyP02pbnCZ6kNelUzCKEPh5uTyaSMlDq8gwBEvLIRC2T3zleP/dQB7yA4GDDWsqJYVJorxDsIIFrZpJysdPdn450CN92COddOVOCdAhCqbGormoqzBT69jfEOghsrRwbHnPbyIQ/vIPqOSGWTcaOu/yhLvFPg7MPhFi+gbPBGpLJJv17r3BXTHX9HjhxZvXr1O7xw2bJliYmJGkiEjMyoNa/EVaViTUwcdBJhyibvMd+lqyEJ2+NmHj9+jPELO6OLLyvvMV9z0wcdIsx+m5unqyzsGJ49NbI9ICcnZ9euXampqRQKpXv37rGxsT169Jg5c2Z6erriCVwu19vb+8iRI9evX8/MzGQwGIGBgfHx8XZ2dgihxYsX0+l0GxubgwcPfvfddytWrFC8is1mX7lyRe1pK4rE9y9WR03Vr03wWoUwS5vyAqGhMUUTUxaLxXPmzJFKpbt27fr555/JZPLChQtFItHevXu7des2dOjQ1NRUb2/v+/fvb9y4MSAggMvlbt26tby8fNWqVYop0Gi0rKysly9fbtmyJTAw8MaNGwihVatWaaJmEEJGppSil42amDLoJMIczdVYL2EZaaRs8vPzq6urp02b5u7ujhD67rvvHjx4IJFIGAxGy6f5+/sfOXLExcWFQqEghGJiYhYvXszj8dhsNoVCqaioOHLkiOIlIpFIEzmbMVkUkUAmkyEyYX70dA1xyqZBamikkbROTk6mpqbffPPN2LFje/To4ePjExgY+PbTKBRKYWHh5s2bMzIyBAKB4s7q6mo2m40Q6tKlS6sy0yiWMbWxXsI2IczXp2MI83tFpZPJGlnYIAaD8dtvv/Xr12/v3r1xcXGjR4/+999/337apUuXFi9e3L1797179967d2/r1q2tJqKRcO2gM8kyApyAp7MIUzY0Oolfr6n/FBcXl88///z06dObNm1ydXVduXLl8+fPWz3nxIkTAQEBc+bM8fT0JJFIPB6eO09qK5pYHM38ioBOIEzZGBpRGuslmphybm7uqVOnEEJMJjM0NPT7778nk8lZWVmtnlZXV2dp+WZn6+XLlzURpjNEjTIanUShEvskVkIjTNlYOzEFjRoZT6Ompubbb7/dunVrUVFRTk7O77//LpPJunfvjhBydHTMyspKTU2trq729PS8e/duWlqaRCLhcrlUKhUhVFZW9vYEGQyGlZXV3bt3U1NTJRL1lzq/XuLkBed74ok4ZePMfH6/QRNT7tmz55dffnn27NlRo0aNHz8+PT19165drq6uCKExY8bI5fJ58+a9ePFi/vz5QUFBn3/+ed++fSsrK1evXu3j4zNv3rzk5OS3pzljxow7d+4sWrSoeeOBGmU/4plY0tU+WdB5hNndKZPKdy7PnrfRHe8g+Dv6Y2HoOCsrR0w3QoCWCLO0IVNIPkHGxS/V/+NNLAKe1IBFhZrBF5E2/Pv04Vz569X4Lxzbe8Ly5ctv377d5kNyuZzUzgFta9eu7d+/v/pi/kdERESb7Y3iTkWD9LaLFy8qdqq+7VZSlWt3lrpjAtUQZiVN4ez+Mo8AtnuPto9Mq6qqam8PvUgkam/XipmZGZPJVGvMN0pKStp7SEkkxaFub6urbDq5uyT2S2f1BQTvgmBlU18tuXGycsg0PT2K8fo/lY4ehi6+sBkNZ4TpbRSMzaiePY3O/F6KdxAc3DtfTWeQoWa0AcHKBiHk1p1lYce4cky/TqnPSKmrKBL1HmKGdxCAiLeS1ux5Gq80VzhgrAXeQbDwKKWurkLcf7S+nxCuPYi3tFHw7Mk2saT+s6NYKiFk2XfetX8qa8qhZrQLUZc2CkUvBMmHyn36GAdF6uDaS+bNupunqz4cbuHbV38H69FOxC4bhBCSo7vnqu9frgkINXXpamjjoqlNyZipKhXnPubnZvIsHZjBw8zpTKKuEegw4pcNQgghSZM8I6Uu+xGvtlLs2dMIIcQyohqb0yQSAlxNjUolN9Q2NdZLRUJZ8YtGKp3cpRvLtzfH2JxIO6P1io6UTTMhX1qcLeTVNjU2SBFCaj9F5/r1671796bT1XkkpYERGckRy4jK4lCtnRjG5jQ1Thxogq6VjaZFRUVxuVwLC73YggfaA+vNAKgMygYAlUHZAKAyKBsAVAZlA4DKoGwAUBmUDQAqg7IBQGVQNgCoDMoGAJVB2QCgMigbAFQGZQOAyqBsAFAZlA0AKoOyAUBlUDYAqAzKBgCVQdkAoDIoGwBUBmUDgMqgbABQGZQNACqDslGNsbFxexczBPoDykY19fX1MCAjgLIBQGVQNgCoDMoGAJVB2QCgMigbAFQGZQOAyqBsAFAZlA0AKoOyAUBlUDYAqAzKBgCVQdkAoDIoGwBUBmUDgMqgbABQGQnOHumMgIAAMplMIpHkcrlcLlf80a1bt4MHD+IdDeAAljadYmNjozipk0QiKerH1NR01qxZeOcC+ICy6ZS+ffu2Wiy7ubn1798fv0QAT1A2nTJt2jRra+vmmyYmJnFxcbgmAniCsukUJyen4ODg5pseHh79+vXDNRHAE5RNZ8XExNjZ2SGEOBzOlClT8I4D8ARl01kuLi7BwcFyudzT0xMWNXqOincANZPL0atCUc0rsVgoVfvE+/iMz/eWD+o96FFKrdonTqOTjc1plnYMugH8lmk7ndpvU5orvHG6SiKW2buxRBooG41iGlJKcwU0OsmzJ9untzHecYAyulM2r4rEV469GhRjT6UTe9TMy4dLvYOMPQNYeAcB7dKR9QEBT3pyZ/GQGQ5ErxmEUNhE20fXawufC/AOAtqlI2WTmlzzQaQF3inUJnCwxcOr6m+fgLroSNmU5gqMzGh4p1AbE0t60YtGvFOAdulI2YhFcpax7mwVpFBJhkZUAV+GdxDQNh0pG4lIpiubNl5rEksR0q23pEN0pGwAwBKUDQAqg7IBQGVQNgCoDMoGAJVB2Q
CgMigbAFQGZQOAyqBsAFAZlA0AKoOyAUBlUDYAqAzKRuNycl5OnDwM7xRAnaBsNO7J00y8IwA1051zVFR1/MSR27evP3mSSWcwAvwDZ86Mt7WxUzyUePKvY8e49Q31ffv2nzFt7sTJw75e9V1Y6CCE0JmziadOH8/Ly3Z19QgLHTR2zCTF2NCrvl5Mo9GCgoJ37NgiEAp8fbvP/uSzrt6+e/Zu/+PQ7wihsPDArVt29+jRE+/3DdRAT5c2Dx/e//mXjX5+ATt3cv+3fuurivL/fbdK8dDjx4+2/rQhPDwq4cDx/h+Gfbt2OUKIQqEghC5cOLNx01pvL59D3JPTp8059tcf23dsUbyKTqenpt6+dev6zp3cs0kpdBr9+x++QQh9PDN+4oQ4a2ubyxdToWZ0hp6WjZ+f/749RyZPmmZv5+Dl2XX8RzGZmek8Hg8hdO78aXNzi6lxn3A4Jv36hfbqGdT8qlNJx7t3D/js02WmpmaBvXrPmDb3n8SjdXW1CCEymYwQWrb0GztbeyqVGho6KD8/t7ERTmzWTXpaNhQKpbi4cNnyBdHD+oeFB676ejFCqLa2GiGUl5/j69NdUQYIof79Byr+kEgkWVkZHwT2bZ5IQMAHUqk0I+Oh4qajk4uhoaHibzbbCCHU0FCP+TsDWNDT3uba9Uurv1kaF/vxnNmfu7l53LlzY8VXnyse4vN5trb2zc80N3s9II5QKJRKpXv37di7b0fLSdXUViv+aK40oPP0tGySkk507x4wfdocxU0en9f8EIPBlEokzTerqisVf7DZbCaTGRU5PCQkvOWk7O0csUoNtIWelk19fZ2dnUPzzZSUy81/29rY5eXnNN+8ceNK89+urh4CoSDAP1BxUywWl5eXWlm9ue4N0BN6ul7h5uZ5P+1uenqaRCI5eoxLpVIRQuWvyhBCffuGZGe/OHI0QS6X30u93dy6IIRmz/r02rWLZ84mymSyR48erFm3YtGSuSKRSPm8HBycqqoqb9y4Wltbo/l3BrCgp2Uz6+P5vXoGfbny88FRfauqKpcuWe3t5bN4ybwrV5MHhg0ePWr8nr3bR48ddOKfI7NmLUAI0ag0hFD37gG7fuU+evRg9NhBS5bFN/L569ZuYTAYyufVp3c/v27+K79e9OLlM6zeH9AsHRk6ff+3eVHTHVgcNaxzSiSSvLwcd3dPxc0nTx/Pi5+6b8+RLl3c3n/inXdkU86U5c4GLAqWMwWdpKdLGyUePEydNXvytp9/KCsrzcrK+OmnDX5+/hjXDNByerpJQIkPAvt88fmKc+dPz/h4PJttFNirz5w5n+MdCmgXKJs2jBg+dsTwsXinANoLVtIAUBmUDQAqg7IBQGVQNgCoDMoGAJXBljTtJZfLBQKBSCRqamoSi8VCoVAikXh5eeGdC0DZaLHJkyeLJTwajSaXy6VSqUwmo1AoYrH4/PnzeEfTd1A2Wkoul9NotKKSV63ul8nggp74g95GS5FIpPj4eDMzs5Z3SqXStLQ0/EKB16BstFfv3r0/+ugjJpPZfA+FQjl16hSuoQDSnbLhWNKlTXiHUCsDNpXOoMyaNSssLEwxphRCyNraOi0tLSIi4sCBA1KpFO+M+ktHysbQiFJZKsA7hdrUvhKTEKJQEUJo7dq13t7eivM7kpKSVq9e/ffff9fX1wcHB2/ZsqWiogLvsPpIR8qm6wfGRc/5eKdQm7wsnk8f4+abP/30k6Ojo62treImh8NZsGDBnTt3bGxs4uLivvrqq6dPn+IXVh/pyGlqubm5dQVm5QWi4BFWeGd5X5k3avn1TRETLTv5/HPnziUkJLDZ7Li4uODgYA2nA0hHyua7775zdnaePHly6oWaihIRy5hm5WhAuPdFoZIqikVNQqlIIImMtVH15ampqQkJCaWlpTExMSNGjNBMRvAascumrq6OwWAkJSWNHfv69JjyfFHBcz6/XtpQLeno1e8iNzfXydGJQlX/ucosDsWATbFxYnbpxnrnieTk5HC53GvXrsXExMTGxirG4AVqR+Cy2bhxY2RkpJ+fX/OGJgxERUVxuVwLCwvM5vgO6urqEhISuFzuRx99FBsba2VF+BVXbUPUsrl06VJlZeX48eMxnu+LFy+6dOmiGCBK+/35558JCQkBAQExMTFdu3bFO47uIF7ZbNu27dNPPxWJRB2OtAQUzp8/n5CQwGKxYmNjP/zwQ7zj6AKCbYBeu3atYgUJr5pZsGBBbW0tLrN+Z4MHD05ISPj444+PHj360UcfnTx5Eu9EhEeYpc3Zs2eHDBlSV1fH4XBwjEGI3kaJ3NxcLpd75coVxTYDoqxtahtiLG1GjhxpZGSk2NOHb5Lt27ebmJjgm+F9dOnSZdWqVcePH29sbOzXr9/mzZvLy8vxDkU82r60efHihYeHR3Fxsb29fSeeDlTz559/crncHj16xMTE+Pj44B2HMLR3acPn88eOHatYi9CemomPjydcb6PEpEmTkpKSQkNDN2zYMHv27JSUFLwTEYOWLm0UJ5ZYWVk5OzvjneU/iN7bKHH//v2EhISioqK4uDg4zkA5rSubysrK+Pj4Q4cOaece7uzsbGdnZx3upPPy8hISEi5fvqzYZkCj0fBOpI20rmx27NgRGRnp5gZDleOpvr6ey+UmJCSMHTs2NjbW2houffUf2lI2JSUlXC536dKleAfpQHx8/Pr16wm9MU0lhw8f5nK5fn5+sbGxsM2gmbaUTUxMzIYNGxwcHDrxXDzpcG+jxIULFxISEgwMDGJiYvr37493HPzhXDZlZWVZWVkDBw7EMYNKdL63UeL+/ftcLrewsDA2NnbkyJF4x8ETnmVTXl4+c+ZMLperP+s8OiA/P//gwYOXLl2KjY3V220G+Oy3KS8v5/F4TU1Np0+fJlbN6Nh+m3fg7Oy8atWqkydPikSikJCQjRs3lpWV4R0KaziUTWpq6owZMwwMDLS/k3lbdna2RKKRE+CIxcjIaO7cubdu3XJycvr444+XL1/++PFjvENhB9OVtIaGBiMjo0uXLhGomWlFn3sbJZKTkxMSEhgMRkxMTEhICN5xNA67sklKSrp48eKWLVuwmR3AXlpaGpfLLSgoiImJGTVqFN5xNAiLlTSRSIQQKigo0IGamTNnjp73Nkr07Nlzy5YtmzdvzszMDA0N3bt3r1gsxjuURmi8bI4fP644L2ru3LmanhcG8vLyoLdRztnZeeXKladPnxaLxaGhoRs3biwtLcU7lJppcCVNLpcXFhZyudwvv/xSQ7PAXl5enoODA/Q2nXf06NGEhARfX9/Y2FhfX1+846iHpsrm5MmTvXr1MjExYbHeffgioDOSk5O5XC6NRouNjdWBbQYaWUk7c+ZMenq6vb297tUM9DbvJiIiYv/+/fPmzUtMTBw7duyJEyfwTvRe1Ly0SUlJ6devX0FBgZOTkxonq0YCgeB9+tTLly8HBwe/zwAgxsbGWA7spoXy8/O5XO6FCxdiY2NjYmKIOAKROstm06ZNTCZz/vz56pqgJjQ0NCi27L0bqVRKJpPf5//ezMyMTNbek2oxw
+fzDx48yOVyR44cGRMTY2dnh3ciFainbF6+fOnu7p6amhoYGKiOVBr0nmXz/qBsWjl69CiXy+3atWtsbGy3bt3wjtMpaiibpUuXDh48OCIiQk2RNOs9y6a2ttbY2Ph9/u+hbNp08eLFhIQEKpUaFxen/dsM3qtsGhoaqqurs7OzCXSwzHuWTVVVlampKZSNhjx8+DAhISE3NzcmJmbMmDF4x2nXu39/X375JY/Hc3Z2JlDNvD8TE5O3G5sJEyYcOnQIp0Q6xd/ff/PmzVu3bn369OmAAQP27NmD7xp1e96xbA4ePBgaGtp8fS9CW79+/blz5zr5ZAqFoufbwTDg5OT05ZdfnjlzRiKRhIeHf//99yUlJXiH+g+Vy+bHH39UnMM8ePBgzUTC2rNnzzr/5NraWplMpsk44DUWizVnzpyUlBRXV9c5c+YsW7YsIyMD71CvqdbbfPLJJ5MnTw4NDdVkJM1q2dtIJJJhw4Yp/maxWH///bdcLj916tS5c+cKCgo4HI6bm9vMmTMV+6AEAsGBAwdu3bpVXV1tZWXl5+c3e/ZsAwMDxUrayJEjJ0+eLJfLT5w4kZycXFJS4ujoGBAQMHXq1FYjV0Fv824uXbqUkJBAoVBiY2MHDBiAb5jOfn/nz59XjMZE6JpphUqlJiYmIoS++OKLv//+WzHWxI4dOwYNGsTlclesWFFWVva///1P8eQdO3ZcvXp19uzZf/75Z1xc3NWrV/ft29dqgomJiQcPHhw9evS+ffuio6PPnTt3/PhxPN6ZDho4cODvv/++YMGCkydPjh49Gt8PtuOyaWpqCg0NVeyN0vlDGE+fPh0SEjJq1CgOh+Pr6zt79uy8vLynT582NDRcvnx5ypQpwcHBbDZ7wIABI0eOTE5ObnU0dEZGhp+f36BBg8zMzIYMGbJly5ZevXrh9250UI8ePTZv3rxt27Znz54NGDDgt99+EwqF2MfooGxqamoEAkFSUhJR9kO9p/z8/JZXHfPy8lJcELO4uFgikXh7ezf3Np6engKBoNVp9D4+PmlpaVu2bLl58yaPx7O3t3d1dcXjfeg4R0fHFStWnDlzRiaTRUREbNiwAeORZJSVzcWLF7lcrrGxse4dkdkmPp/f6iJtitZFIBBUV1cjhJhMZvPX0/xQyymMGjVKMUbHmjVrJk6cuGnTJsULgSawWCzFcO+K43SwnLWylS4+n9/Q0IBhGJwpCqblQr+xsVHRxCt+OIRCoa2traKhVzxkbm7ecgoUCiU6Ojo6Ojo/P//BgwcJCQmNjY1ff/01Hu9Gj7i5uWH8j6qsbEaMGKFXI89TqVQPD48nT54035OVlYUQcnFxsbCwoFAojx8/dnd3Vzz07NkzDodjamra/GS5XJ6cnOzp6en8/+rr65OTk/F4K0CzlK2kyeVynd9HwWAwLCwsHjx4kJ6ertgeff369cTERB6Pl56evnv37l69enXp0sXIyCgsLOzPP/+8ePGiohgU23Na7vokkUjJycnr1q27c+dOQ0PD3bt3b926Bddn1knKljaJiYmZmZkrV67EMA8OJk6cmJCQcPfu3YMHDw4ePLimpubYsWO//vqrtbV1z549Z8yYoXja3Llzd+/evW3bNqlUamdnN2nSpHHjxrWa1KJFi3bu3Ll69WrFqt2QIUPGjh2Lx3sCmqVsd+fJkyczMzN1aSQAtZxv854X3oHdnWq3f/9+Ho+H5Yle0NuoRjsvVgUwpu+9jargmDTQQdkkJiY2H1oCFKRSKd4RAP6UlQ2ZTIa18FZMTEzgMwHQ26gGehsAvY3KoLcB+rjfhs1mv88hdjNmzNizZ4+Zmdk7TwHW8XSAsrLRyd6GRCK9z1nNO3fuNDc3172PBagEehvVEGsUPKAhyn41pVIpXJSilRkzZsAY0EBZ2Zw6dWrDhg0YhiGAkpIS+CkBysqGSqXq/FnQqtq3b1/LkwWAflJWFcOGDWse2AUoQG8DoLdRGfQ2AHoblUFvA6C3URn0NgB6G5VBbwOgt1HZ1KlTa2pq8E4BcAa9jWrKy8vhlBsAvY1qDhw4AL0NgN5GNdbW1nhHAPiD3kY10NsA6G1UBr0N6GAljUql0ul0DMNor6ioKMUI0VQqdfr06WQyWSKRWFlZ/f7773hHAziA3qZTyGRycXFxy3sMDQ0XLVqEXyKAJ2UraRKJRCwWYxhGe/Xu3bvV8KVdunTRq0tkg5aUlc3p06d/+OEHDMNor9jYWCsrq+abLBZr6tSpuCYCeOpgvw30Ngqurq59+vRpvunu7g6LGn2mrGyGDRu2dOlSDMNotZiYGMVOGxaLNWXKFLzjADxBb9NZbm5ugYGBcrnczc0NFjV6TtmWtNOnT2v5OGm8OmllsaixHqN9sgMD40qekqP6R2XdrsdmjgxDioUdnWNBw2Z2oJMIvN/m3wNl5QUiMxsGnYnZqGVG44bNRQgVvsDoot4UGinlZKWJJS0yxobJgsHZtAUh99vI5ejE9mL3AM6Ho2zwzoKF6lLRP78Wj5htZ2gEI1BrBUL2Nqd+K/EOMu3SjY13EIyY2TLCJtge3lyAdxDwGvH225TkCCkUsqOXId5BMGVoTHXvYZxxow7vIAB1UDZ0Op3JZGIYplMqikQMlj6uq7A4tIqid7/qKFAjZb1NdHR0dHQ0hmE6pZEnMTbTxy1LbFPaq4JGvFMA1MHSpqmpSSjEaJNR58mkSCpt9+rWOkwuk4uFcGkdraCsbJKSkjZt2oRhGACIgXi9DQC4I15vAwDuiNfbAIA76G0AUBn0NgCoDHobAFQGvQ0AKoPeBgCVQW8DgMqgtwFAZcqWNmKxuLERjh18Y/jI0D8OwTCcQGnZnDlzZsuWLRiGwV9OzsuJk9s9oXXihKl+3fyxTQS0kbKVNAaDYWioX2eDPXmaqeTRKZOnY5gFaC9lS5shQ4YsXLgQwzCaMnxE6PHjhz/7YlZYeGB9Qz1C6MzZxLnxU4cM7Re/YPpffx9SDFS7Z+/2TZvXlZeXhYUHHvvrj7/+PjRufFTKjSvhg4J+3r6p1UpaRsbDxUvmDR8ROnX6uF93buXz+QihXbu3DR0e0vKSBIePHIwcEqxY121zpoCI9KK3odHpx08cdnf32vjDdkMDwwsXzmzctNbby+cQ9+T0aXOO/fXH9h1bEEIfz4yfOCHO2trm8sXUj8ZNodHoAkHj4SMHVyxfM3rk+JYTLCjIW7p8fpOkafsv+1ev2vDixdNFi+fIZLKwsMGNjY337t1qfub1lMvBfUMMDdudKSAivehtKBSKhaXVgvjFgb16U6nUU0nHu3cP+OzTZaamZoG9es+YNvefxKN1dbVvv6qxsXHmjHkR4VEODk4tH0q+eJZGpa35ZqOTk4urq/uSJV8/e/7k5q1rnh7ednYOKTeuKJ5WVVWZlZUxcGAkQqiTMwWEoKxs2Gy2hYUFhmE0yNOjq+IPiUSSlZXxQWDf5ocCAj6QSqUZGQ/bfKGXp8/bd2Zmpnt7+3I4JoqbtjZ2dnYO6elpCKGI8Khr1y8pVsCuXb9kYGDQt09/VWcKVGJoaMhmYzqMkbJN
AhERERERERiG0aDmcRKFQqFUKt27b8fefTtaPqGmtlr5C1vi8RpevHwWFh74nynUVCGEBkVEH0zY8zD9foB/YErK5dABg6hUKo/HU2mmQCWNjY08Hg/LOSorG7FYLJFIdGxjGpvNZjKZUZHDQ0LCW95vb+fY+YmYmVv4GRhMnzan5Z0cYxOEkIODk6ur+/Xrl1xdPR6m39/4w3Z1zRRoD2Vlc+bMGS0fA/rduLp6CISCAP/XywqxWFxeXmplpcIloN1cPS5fPu/foxeJRFLck5eX09z/hIUOPvvvSQd7JzMz8+a5vP9MgfZQ1tvo6n6b2bM+vXbt4plJ2NtyAAAf40lEQVSziTKZ7NGjB2vWrVi0ZK5IJFIsK6qqKm/cuFpYmK9kCuPHx0qkkl92bBYKhQUFeTt3/TTj4wm5edmKR8PCBpeUFJ07fzp0wKDmulIyU0A4erHfppXu3QN2/cp99OjB6LGDliyLb+Tz163doriibZ/e/fy6+a/8etHFS+eUTIFjzNm75wiTwZw9N2bq9HHpj9KWLVnt4e6leNTezsHLs+vzF08V29A6nCkgHJKSnW7a2dvcOFVFppK7BZviHQRrxS8bn92rHTnHDu8gWmf//v08Hm/+/PmYzVEv9tsAoF7KyobJZGK8ORwAQlC2JS0qKioqKgrDMAAQg7KljVAoxHgvEgCEoKxs/v33361bt2IYBgBigN4GAJVBbwOAyqC3AUBl0NsAoDLobQBQGfQ2AKgMehsAVAa9DQAqU1Y2BgYGxsbGGIbpFAM2+f/PYdEvchkyMtXHK8trIWVlExkZ+emnn2IYplNMrejl+fp4+ZCKIoGRKQXvFAB13NvU19djGKZTnLuyGuuaJE16Nzbfq0Khh78R3ikA6ri32bZtG4ZhOoVMRgMnWl86XIJ3EExd/avM70NjEytYSdMKyjZAa2dvgxCydmKEjLY4uOalXz9TM1smw0BZ8ROaVCqvLBKW5jb26M/x7AmLGm2h7KRoLSeXoQdXaiuLRbw6iebmUlJSYmNtTaZg2lTk5+Vb21gzmUxjc5qxKdWzl5GJJSxn2oX9SdHKljZCoVAsFmvnAgchRCKjngNNNDqLV69eTZu2+syZMxqdS1vsf/zxxy/mf4H5fEGnEK+3wZKBgcHu3btxmfUXX3yBEPrtt9+qqqpwCQCUIN5+GywZGRk5ODjgGGDEiBGTJ08m7oq0riLefhssrV27Nj09HccA1tbW586dk8vl+MYArRBvvw2Wrl696uLigncKRCaTzczMoqOjYRRPLQG9TbukUukff/zB4XDwDoIQQo6Ojvv378/Ly2toaMA7C4Depn0UCsXaWouGNreysvLy8mpqalq0aBHeWfQd9Dbt2rdv359//ol3itbMzMxGjhyphcH0irKyaWxsrK3V36vk3b9/383NDe8UbQgJCRk/fjxC6ODBg3hn0VPKyub8+fO//PILhmG0y48//hgUFIR3irZRKBSEEIlE+vnnn/HOoo+UHSXAYrFMTDS7G16btXn5Qa0SGxubn5+PEMrIyPDz88M7jh5RtrQZNGgQlsf5aJXk5OTly5fjnaJjzs7OCKF79+5t374d7yx6BHqbtmVnZ3fv3h3vFJ01Y8YMJycnhBCfz8c7i15QtpJ2/vx5nbx2Z2fMnj0b7wiqGT58OEIoKSmJyWSOGDEC7zg6TtnSRp97m+rqaiIeCTZ+/Pj09HS9XUfADPQ2bSgqKpoxYwaJmCN9rFq1isFgpKenP336FO8sOgt6mzbk5OR8+OGHeKd4dwYGBt26dVu3bl1OTg7eWXQT7LdpQ0hIyJIlS/BO8V4oFAqXy21qapJINHjqq96C3qYNJSUlAoEA7xRq4OXlRSaTP/jgA8XuHaAu0Nu0YdKkSTKZDO8U6kEmk+/du3f79m28g+gU6G1aKy8v79u3L4vFwjuIOk2YMAEh9NVXX8GOHbWA3qY1a2vrDRs24J1CI2bNmrVw4UK8U+gCZWXDZrNNTU0xDKMViouLy8vL8U6hES4uLrt27VKcgIh3FmJTVjYRERHx8fEYhtEKa9euLSwsxDuFZtnb2w8fPpyI+3O1hLKy4fP5NTU1GIbRCh4eHp6ennin0Cw/P7/du3dLJBKdWa5ivFqkrGwuXLigh8fVLlq0SB9OBbe1taXRaI8fPz5+/DjeWd7XvXv33N3dsZwj9Dat3b17V38GiBk4cODTp0+FQmJf+OTJkyddu3bFco4EHgNaQ4YOHbpv3z6tGnxD00QiUVpaWt++ffEO8i6Ki4vnzZuXmJiI5Uyht2ktKCiIwWDgnQJTDAbD2dlZMT4B4WC/qOlgafPPP//o7fk2eignJ4fNZhsbGzOZTLyzqODnn3/mcDhxcXFYzhR6m9b0qrdpydXV1crKKiUl5f79+3hnUUFWVhb2SxskB/8VHR1dVlaGdwo8ffLJJzweD+8UnTVgwICGhgaMZwq9TWt62Nu0smvXLrFY/Pz5c7yDdKywsNDU1JTNZmM8X9hv09rq1av183SJlkxNTel0+rJly/AO0oEnT574+PhgP19lZWNkZGRhYYFhGK2gt71NKy4uLpGRkUVFRXgHUebJkyfe3t7Yz1dZ2YSHh8+ZMwfDMFrh22+/1cPTJdo0cOBAS0vL69evV1RU4J2lbVlZWVq3tOHxeJWVlRiG0QrQ27TEYDCCg4Pj4uIaGxvxztIGXHbadFA2ycnJO3fuxDCMVoDephUKhXL27NmKigptO+4zPz/fysrK0NAQ+1lDb9Pa7du3obd5m7OzM4/H06pNRHg1NtDbtGHt2rXQ27TJzc3N0NBQe0bzwGszGvQ2bejbty/0Nu2ZPn06h8N58uQJ3kEQbscHINTBGNDJycn6c0za4MGD6XS6YiROxTgvcrnczMwsISEB72jaxcTEhE6nh4SEXLx4kUajKe6Mjo4ODg7G+F8Fx6WNsrLRq96GRqOVlZW1vIfBYOjhOmpnGBoa/vvvv5mZmZ6enoohfsrLy+/fv19dXW1mZoZNhtzcXFtbW7zWC6C3eS0wMLDV2GhdunRRDOMP3mZoaBgQEFBeXn7q1KmgoCASiVRaWorlyB44Lmqgt3ljypQpNjY2zTcNDQ1jYmJwTUQArq6ua9asUfzcNDU1JSUlYTZrHBsb2G/zhqenZ69evZpvurq6RkVF4ZqIAMLDw5vP1yKRSOXl5Tdu3MBm1njt6FSA/TZvxMbGKhY4LBZr4sSJeMfRduHh4a221FdXV2M2oIf2lo1e9TaKBU5AQIDiKEZY1HQoLCzM19fXwcGBwWDIZDK5XE4mk3NycnJzczU96+zsbCcnJxyvSazspOj6+nqRSGRpaYlloCaxvKZMzKvH5/ISJSUlu3fvHj58eMsVNiwxDSkWdnQ6U9nPmfaQiOXpqdkvnhTk5OYWFRXx+fyGhoY+ffqMGzdOo/O9d+9edna2JtYIDI0o5rYMGr2DK4Jp11gCt85UvXjAozPJxqZ0iURHxvxXCYmESnIErt1Yg6Zo+9A5t89Wv3jQQKOTjc1ef1lSmVQqkWKwEJDJ5SSENHG5OwFPyq+TeAS
w+49S1p4o22/D4XCwXNRcPlZBpVNGz3fGbI5aKz+Lf+ynorHz7ckULb0Q4pVjFRQaZVS8bn5Zj2/X/nuwPCqu3V8ubRknLSWxEpEpPUL0bsSP9pTlCjJSqsfMt8c7SBv04ct6ereurlIUMcmqzUeVrUPX19djc35SQ7Wkolis21+Dqmy6GBhb0HMytO5yNHryZXkHcYSNsorCto+FV1Y2ly5dUlzXQdOqysQkYvTAmGIaUiqKtO4UBv35sqg0UlWZuM2HlH0AmPU2vDqJqRUcdNyaiSVdwNe67SL8en35sjgWdF5d21t0lW0SCAsLCwsL01iqN2RSWZNY6/4/cCeRyMVCKd4pWpNK9OXLkjTJKZS2H9KK3gYAYtGK3gYAYtGK3gYAYtGK3gYAYoHeBgCVQW8DgMqUlY2JiYleXYsPgE5S1tuEhoaGhoZiGAYAYlC2tKmrq2s1mAsAoIOyuXz58p49ezAMAwAxQG8DgMqgtwHEtvLrRWKR6Ifvf8FyptDbqNM33y47czYR7xT6JXTAoPCBr8dLwezzh95GnZ4+e4x3BL0TER4VGTlM8Tdmnz9Re5uqqsqly+YPHR4yN37quXOn9+zdPn3meMVDEonk151bp04fFz2s/7IVn96+naK4/+XL52HhgfdSb6/8elFYeOCESUN37vqp+ZzwysqKNWtXTJg0dMSogeu/W1VY+PpyFH/9fWjc+KiUG1fCBwX9vH0TQig3N/unbd/HTRsbFf3h7Dkxp5NOKGYaFh5YXl62cdPa4SNfr9meOZs4N37qkKH94hdM/+vvQ1py/jn2ho8IPX788GdfzAoLD6xvqEcIZWQ8XLxk3vARoVOnj/t151Y+n48QWv/dqiVL45tfNXX6uHHj3wy79c23y75atfDFy2dh4YG3b6eMGx/18SeTFCtpS5fNx/jzV1Y2oaGhs2bNUtec1OuHjd8WFuZv3rTz29U/3Lh59fadFMr/nxvx49bvjp84PHbMpD8PnQ7pP3D1t0uvXb+EEFKMqLJ5y7qI8CHn/721fNm3R44mXL5yQfFPv3DxnIzMh4sXrdq/75ixMSd+/rSS0mKEEI1GFwgaDx85uGL5mtEjxyOEfv5lY+r9Ows///LwodPR0aM2b1l/L/U2lUr998wNhNCSxatOJV5BCF24cGbjprXeXj6HuCenT5tz7K8/tu/YgvfHhg8anX78xGF3d6+NP2w3NDAsKMhbunx+k6Rp+y/7V6/a8OLF00WL58hksl49gzIyH0qlUoRQdXVVSUmRSCgsLnl9zd30R2m9evam0+gIoT37tk8YH7to4ZsxlTD+/AnZ21RXV929d2vixKneXj5WVtaLFn5VVlaieEgoFJ6/kDR50rQRw8dyjDlDo0cNDIvkcvcihMhkMkJoaPTo0AERNBotwD/Q2trm6dPHiq+ksDB/xfI1HwT2MTMznz9vkZEx5/jxw4pL8DU2Ns6cMS8iPMrBwQkhtHr19xu/3+7v38vExHTkiHEe7l537958O+SppOPduwd89ukyU1OzwF69Z0yb+0/i0bo6fbzgFIVCsbC0WhC/OLBXbyqVmnzxLI1KW/PNRicnF1dX9yVLvn72/MnNW9d6BgSJRKLnL54qvhFvb19Pz66ZGQ8RQnl5ObW1NYG9eit+HD8MHvDRuCldvX2VzFSjn7+ysklJSTl8+LBaZqNe+QW5CCG/bv6KmxyOib9/oOLvp08fSySSDwL7Nj85wD/wxctnitUAhJCn55sRUNlsIx6vQbHOQKPRegZ8oLifRCL59+iVkfGg+Zlenm8Gt5fLZMf+/iN26piw8MCw8MAXL5/V1la3SiiRSLKyMv4TI+ADqVSamZmu1k+CMDw93nzsmZnp3t6+HM7rC6Ta2tjZ2Tmkp6dZWVk7OjpnZj5ECGVkPuzq3a1btx6Zj9MVVWRlZe3k5PL21NrU3uefkfFQLW9H2QZoNpvNZrPVMhv1auTzEUJMA4Pme0xNzBQLHB6/ASG04LOZrV5SXV2pGI1OscxphcdraGpqCgsPbHmnufmbAeaah8yTSqXLli+Qy+WfzFrg7x9oxDaaN3/a2xMUCoVSqXTvvh179+1oeX9tXc27vmliaznmII/XoGhRWj6hpqZK8Rv36NGDj8ZNSU+/P33aHAaD+cv2TQihhw9TA/w/eDO1ji5r097nX/PWD9y7UVY2AwYMGDBggFpmo16KT00qeTM8QvPHYWZmgRBatPAre3vHli+xsLCqqmr3JAhzcwsDA4P1635seSeV0saH8+xZ1vMXTzdv+rV50aRYXrXCZrOZTGZU5PCQkPCW99vbOb79ZH1jZm7hZ2Awfdp/hhfnGJsghHr2DNq8ZX1dXW1OzsueAUEUCqWwML+urvZ+2t1PFyzt/Cw0/fkrK5va2lqBQGBra6uWOamRrY0dQig3L9vR0VlxHZ60tLt2dg4IIUdHZzqdTqFQAv5/ta26uopEIhm0WDS9zdXVQyAQ2NjYKaaMECouKTIzNX/7mYqVYwvz1ye95uS8LCzM9/JsY53B1dVDIBQ0xxCLxeXlpVZWWrplEkturh6XL5/379GreTTavLwcRd8YEPABj9dw7vxpNzcPxZXTPdy9zpxNbGioD+zVW6W5aPTzV9bbXLlyZe/evWqZjXo5ODg5OjrvP7CrpLSYx+Nt/ek7W9vXo1casY2mTZ29/8CujIyHYrH4ytXkJcvif9r2vfIJ9g4KDgoK3rhxTXl5WV1d7fETR+bOizv778m3n+nSxY1EIh376w8ej5efn7vj1y0fBPYpKy9VXLTQ0tIqLe3ug4epEolk9qxPr127eOZsokwme/TowZp1KxYtmQuXbkcIjR8fK5FKftmxWSgUFhTk7dz104yPJ+TmZSOEjI2MPT28T578q5tvD8WTu/n5nz593NPD28SkgwENsfz8lZWNqalpywuMaZVlS1bLZLKY2FFfLPzEy8unm28PGvX15VcnTZy6eNGqQ4f3Dx8Zuu3nH+ztHJcs/rrDCX63fmtISPiadStGjYn4J/FoVOTwMaMnvP00Wxu7r75cl5H5cPjI0JVfL5o5M37EiHGZmekzPp6AEJoyeUbq/Turvl4kEAq6dw/Y9Sv30aMHo8cOWrIsvpHPX7d2C1yDGiHEMebs3XOEyWDOnhszdfq49Edpy5as9nD3Ujzq7x9YXFLk5xeguOnr072ktLh5k49ymH3+WjEG9KOU2ldFTb2HqDDcR11drVAotLZ+XdUrvvqcyWCu/nqDxjLiIPtRw6v8xsEx2rVe9w5fFkE9vFLNYKKgyDYu4qtsaVNbW1taWqrJYO9u1erFCxfNTkm5UlNTncDde//+nWHDxuAdCugLQvY2CKE132x06eK2c/dPk2NG3Lhx5Zuvv+/VMwjvUEBfKNuSps29jYmJ6fq1enqsCsAdIffbAIAvovY2AOCIqL0NADgi6n4bAHAEvQ0AKlO2tKmpqSkqKsIwDADEoKxsrl69un//fgzDAEAMysrGzMzM3l4bL/ANAL6U9TYhISEhISEYhgGAGKC3AUBlWtHb0JkUGkM/rnWvCjKJxOIoWx
3Ahf58WVQaiWnY9qWitaK3MbOml2Q3YjAjYikvEHDMta5s9OfLKssTmFjS2nxIWdmEhIRMnz5dY6nesHJk0JlkIV+KwbwIpK5S3MVH64ZA0ZMvSyqRi4VSBw/DNh/Vlt4mdKzl5SNw/NsbV46W+X1ozDJpeyUBX/rwZV08VBIyypLczsev7OzOf/75JzMzc+XKle09Qb1qXjUd+iE/KNKCbUpnGVO14bRT7InFsuoSUW5mQ9BgM1c/Ft5x2lXzqumP7/N7R1mwTegsju58WQKetL6q6eGVqlFz7a0c2z2DWlnZXLt2LTs7G5v1NAWZDKWery4vFIr4MolEhtl8W6quruFwOBQKPl2vsRnN2Jzm25djatX2WrX2kMtR6vnqsgI8vyy1MzCiWjsyeg40pTOV/QNoxVgCWmXo0KH79u3T2jHjgTbQlt4GAALRiv02ABCLsrKxsLBwcHDAMAwAxKBsb1q/fv369euHYRgAiEHZ0qaqqqqwsBDDMAAQg7KyuX79+oEDBzAMAwAxQG8DgMqgtwFAZdDbAKAy6G0AUBn0NgCoDHobAFQGvQ0AKoPeBgCVddDbODk5YRgGAGKA3gYAlSlb2lRWVubl5WEYBgBiUFY2KSkpXC4XwzAAEAP0NgCoDHobAFQGvQ0AKoPeBgCVQW8DgMqgt/mPuro6FxcXU1NTvIMArdbB2JNFRUW//PILVmFwVldXN2bMmO3bt9PpdLyzAK3WQdk4ODhERkb+8MMPWOXBjVAoHDp06MWLF/EOAggABrNFCCGZTNanT5+7d+/iHQQQQ2cHCE9LS/vxxx81HAY3wcHBN2/exDsFIAwVljaZmZlZWVnjx4/XcCSsDRgw4MyZMyyW9l4VA2gbfV9JGzRo0NGjR2HTGVCJyldxSUpK0pm1tWHDhnG5XKgZoKp3Wdo8fvy4rq4uODhYM5EwMmbMmK1bt8L+XPAO3nElTSwWy+VyBqPdq7RpuYkTJ65bt87d3R3vIICQ3vFSe3Q6ncvl/vrrr+rOg4WpU6euWrUKaga8s/faJPDs2TMKhUKs/79Zs2bNmzcvICAA7yCAwN53S1pFRQWVSiVKVx0fHx8XF9e7d2+8gwBie9/rIVtaWu7evfvYsWNqyqNBCxcunDBhAtQMeH/q2W+Tn5/PZrPNzc3VEUkjli9fHhERERERgXcQoAved2mj4OzsXFZWVlZWppapqd3q1atDQkKgZoC6qKdsEEK+vr5bt25NTk5W1wTVZf369f7+/tHR0XgHAbpDzQfX1NXVMZnM5v05ERERGBfS999/f/bs2StXrihubty40cnJacKECVhmADpPbUsbBQ6Hk5KSolhb69u3b3V19YYNG9Q7C+UePnzY0NAQGRmJENq2bZu1tTXUDFA7NZcNQig8PHz16tV9+vRpamoikUh37txR+yzak56eXl1dTSKRqqqqgoODDQ0N4+LiMJs70B/qLxuE0KNHjyQSCUKIRCLx+fzU1FRNzOVtN27cqKysVPwtFosTEhKwmS/QN+ovm6CgoKampuabVVVV165dU/tc2nTnzp2WrRqfzw8LC8Nm1kCvqLlsRo8ebWpq2vJ/Vy6X3759W71zadPz58+rqqrIZHLzfOVyOYvFmjJlCgZzB3pF2YBP7+DEiRMpKSnXrl1LS0urqampqakhk8l1dXWPHz/29fVV77xauXXrVnl5uaJgzM3NjY2NBw4c2L9/fz8/P43OF+gh9WyAbmyQ8uslTUK5HL2emkAgePz48f379wsLCysrKwcPHjxmzJj3n5ES//vf/0pLS42Njbt169ajRw8fH5/mh0gkEtOQzDKmMgw10ssBffPuZVOaI3yRzisvEJfnN9KZFJoBhcakyiWyVk+Ty+VNEgmdRlNH2g5IJFIqlfL2/QwWlVctEgulcpnc1JrhEcB282OZWGIRCeikdymbzJv1Wfd4Ap6UZcYytmbTDdr4T9VOchkS1AvrXzXyqxtNrel9Ik3s3AzwDgWIR7WyyclsvPLXKwOOgZWbGYVG7BUeQZ3oVXY125gcPd3agE2YygfaQIWyuZlUXZQrNbE1phuqeUMCjhoqBVV51QPHWzh5GeKdBRBGZ8vm5O5SsYRm0YUYp6OpquBhae9IE6+ebLyDAGLoVNkk/1lZW0u2cOFgEgkfJY8r/EPYPkFQOaBjHfcn105U1jfoeM0ghOx8Le9fqi143oh3EEAAHZTN09SG8mKZmaOO14yCYw/by0crGxukeAcB2q6Dskk+VG7pqr2nOqudmZPpuYRyvFMAbaf02p0nq2w8TBEJwzh4M7I0rK+VluYI8Q4CtFq7ZSMSyHIyGy1cTLDNgz8rV/M752vxTgG0Wrtl8/RePZ2lvWPVpj06t3hV78bGerVP2YDDqCwR1lU2deK5QE+1WzYv0/lscz3dA8i2MMzJ5OGdAmivtstG0iQvLxCyzfX0eC0jC9aLB3y8UwDt1fZhMhWFIpapBq+WnJP/8MLlPYXFT4zZFl29PhwUOpPJZCGErt86fOnawamTNhw9sf5VZZ6ttXvIh5M/CBiqeNXpf39OTT/DoBsGdI+0MHPQXDwDY0bBQ5Hmpg+Iru2lDb9eQqVr6sCz8oq8PQc+k0okCz7ZGzthfXHJ052/x8tkMoQQlUJvFNT/k7RlwpiVG9fc9vMJPfbP+tq6Vwihm3f/vnn3rzFDl3w2+3dTE5uLV3/XUDyEEJlCQiTUJGp9EgQACu2WDYWmqYOCH6Sfo1BoUydtsLZ0sbVxHz96ZVHJk6xn1xFCJDJZKm0aEf25s6MfiUTq5R8tk0mLSp4ihFJuHe3uG96920BDQ+PevUa4umj2kgEMAyqvDvZ7gra1XTYyKaIxNHUWV15BuqODD4v1etO2mamduZlDTt6D5ic42b8+fdqAaYQQEggb5HJ5ZXWhtVWX5uc42HfVUDwFlilDLISlDWhb22tidCa5SSjQ0CwFQl5x6bPFq/4z8n9DQ1Xz3yRS6z2sQhFfJpMymW+Os6TTmBqKp1BfKTQ0gpNwQNvaLhtDY4q0SVOrKEZG5l3o/pEDP2l5J8tQ2WFvTAaLTKZIJG/adJFYs8dcihslLGPdOa0IqFfb/xlsDpXG0NTJm3Y2Hg8zLrh16dm8VCl7lWNpruzSsyQSydTENq8go3/fiYp7njy7oaF4CCFZk9zCwYAMCxvQjrZrw9KBUVPaKG3SyMr9gA+nSKWSxDM/isXC8oq80//+vPmXyWXl2cpf1aNbRHpm8qPMSwihS9cOFJY80UQ2hfpKvpEJLGpAu9pdpDh3ZTVUaGRFiGXIWTz/EJ3G3Lpz6sZtE3LyH4wfvcrezkv5qyIGTP8gYNjxpI2LV/V+8vzm8MhPEUJyuUYKm1/d6BHA0sSUgW5o9+zOnEf8e5d51p4WmEfCX87doqlfOlFo+nTsN1BFu0sb1+4sfo2gSSDBNg/+aorrXboaQM0AJZStwfcfaXH/SrWNt1Wbj9bUlm3e3vbwygZMY4Gw7WOTba3d4z/e9U5R27b6u0iprI3alkolCCEKpY036OvVf9K4b9qbYNmL6mFrurT3KAAdD
8FxfHuJoZUZk93Grk+pVMrn17T5qiaJmEZt+5A2MoXKZqnzHJ76+sr2HmqSimmUNmLQaAwDA6M2X1JbUm9rL+8dZabGhED3dFA2Qr7swLo8rxBnDCPhRlAvqsmvmrzUEe8gQNt1sHOGySIPmW5TmF6KVR7cyOUo524J1AzojE6Nk1aaJzrHrXTpZYNJJBxIxLKSx2XjP7djGsI+TtCxTh0KYOvCCI7mvLxVqNarSmsLfo0o507h+M/toWZAJ6kwBnTNK/HZAxU0FtOyi46MyyERS8tfVLOM0KjZOrsgBZqg8oU6rp2oeny71r6rhaGJAZVB1J9nYYOYX91YVVgfPMyiW9+2t6oB0J53ub6NWChLu1ybebOOzqQaWbPJFAqVQaEyKBQqWTvX4kgkkkQskYikErFUzBfVVzQaGJK79+P49dOL0UaB2r3XRQhfFYqKXgjKC4QNtZLGeqlMjiRibTy1i2PBEAkkLGOqsRnV2onRxZdlZApHaoJ3p55rdwKgV4h9RTQAcAFlA4DKoGwAUBmUDQAqg7IBQGVQNgCoDMoGAJX9H1w0iaiTpoC9AAAAAElFTkSuQmCC\n", + "text/plain": [ + "" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "ae730429", + "metadata": { + "id": "ae730429", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "525b0cf5-1315-4941-c341-4e7d178eb83f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "---CALL AGENT---\n", + "\"Output from node 'agent':\"\n", + "'---'\n", + "{ 'messages': [ AIMessage(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_orodPM8UEjVSn7vR8uGQ2g2B', 'function': {'arguments': '{\"query\": \"graph construction for GNN component in stock analysis\"}', 'name': 'document_understanding'}, 'type': 'function'}, {'index': 1, 'id': 'call_w2YvW3VYPvUg5IWY0C9C1gvn', 'function': {'arguments': '{\"query\": \"metrics used to define relationships between stocks in GNN\"}', 'name': 'document_understanding'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'service_tier': 'default'}, id='run--6d4ebeb9-60b9-4ce5-88a9-a11abeaa01a5-0', tool_calls=[{'name': 'document_understanding', 'args': {'query': 'graph construction for GNN component in stock analysis'}, 'id': 'call_orodPM8UEjVSn7vR8uGQ2g2B', 'type': 'tool_call'}, {'name': 'document_understanding', 'args': {'query': 'metrics used to define relationships between stocks in GNN'}, 'id': 'call_w2YvW3VYPvUg5IWY0C9C1gvn', 'type': 'tool_call'}])]}\n", + "'\\n---\\n'\n", + "---CHECK RELEVANCE---\n", + "question: \n", + " How is the graph constructed for the GNN component, and what metrics are used to define relationships between stocks?\n", + " \n", + "context: For the GNN component, a graph representing relationships between stocks was constructed. Each node represented a stock, and edges represented relationships based on Pearson correlation coefficients and association analysis. Pearson correlation captured linear relationships by calculating the correlation coefficients between the daily returns of stock pairs, while association analysis identified non-linear and complex relationships. Combining these methods provided a comprehensive representation of inter-stock relationships, crucial for modelling the interconnected nature of the stock market.\n", + "\n", + "3.1.2 Data Preprocessing\n", + "\n", + "Data preprocessing focused on scaling and structuring the data. The raw data was clean, with no missing values or significant outliers. Min-Max Scaling was applied to normalise the data. Specifically, each stock price \\( x \\) was transformed to a normalised value \\( x' \\) in the range [0, 1] using\n", + "\n", + "\\[ x' = \\frac{x - x_{\\text{min}}}{x_{\\text{max}} - x_{\\text{min}}}, \\]\n", + "\n", + "GNNs are essential for analysing stock relationships by capturing dependencies between stocks, modelling the interconnections that traditional methods overlook. (Shi et al. 
2024) developed a graph-based GCN-LSTM model integrating relational data with time-series analysis, achieving more accurate predictions by leveraging GNN to capture stock interconnections. Other studies have emphasised the importance of capturing relational dependencies using Graph Convolutional Networks (GCN), showing that GCN outperforms traditional time-series models by considering both temporal and relational dynamics (Singh et al. 2021, Chen et al. 2018). These findings illustrate that GNNs improve stock market predictions by capturing complex relationships between stocks, and when combined with models such as LSTM, they effectively handle both relational and temporal dynamics.\n", + "\n", + "**3.2 Graphical Representation of the Stock Network**\n", + "\n", + "- Stocks were modeled using a graph \\( G = (V, E) \\) where each vertex \\( v \\in V \\) represents a stock, and each edge \\( e \\in E \\) represents a significant relationship between two stocks.\n", + "- Pearson correlation coefficients were calculated between daily returns of stock pairs to quantify linear relationships.\n", + "- Daily return \\( r_t \\) for a stock at time \\( t \\) was computed as:\n", + "\n", + " \\[\n", + " r_t = \\frac{P_t - P_{t-1}}{P_{t-1}}\n", + " \\]\n", + "\n", + " where \\( P_t \\) is the closing price at time \\( t \\).\n", + "\n", + "- Pearson correlation coefficient \\( \\rho_{ij} \\) between stocks \\( i \\) and \\( j \\) was computed as:\n", + "\n", + " \\[\n", + " \\rho_{ij} = \\frac{\\sum_{t=1}^{n} (r_{i,t} - \\bar{r_i})(r_{j,t} - \\bar{r_j})}{\\sqrt{\\sum_{t=1}^{n} (r_{i,t} - \\bar{r_i})^2} \\sqrt{\\sum_{t=1}^{n} (r_{j,t} - \\bar{r_j})^2}}\n", + " \\]\n", + "\n", + "**3.4 GNN Component**\n", + "\n", + "- The GNN component modeled relational dependencies among stocks based on the constructed graph.\n", + "- Architecture included an input layer that received the stock graph and initial node features.\n", + "---DECISION: DOCS RELEVANT---\n", + "\"Output from node 'retrieve':\"\n", + "'---'\n", + "{ 'messages': [ ToolMessage(content=\"**3.4 GNN Component**\\n\\n- The GNN component modeled relational dependencies among stocks based on the constructed graph.\\n- Architecture included an input layer that received the stock graph and initial node features.\\n\\nFor the GNN component, a graph representing relationships between stocks was constructed. Each node represented a stock, and edges represented relationships based on Pearson correlation coefficients and association analysis. Pearson correlation captured linear relationships by calculating the correlation coefficients between the daily returns of stock pairs, while association analysis identified non-linear and complex relationships. Combining these methods provided a comprehensive representation of inter-stock relationships, crucial for modelling the interconnected nature of the stock market.\\n\\n3.1.2 Data Preprocessing\\n\\nData preprocessing focused on scaling and structuring the data. The raw data was clean, with no missing values or significant outliers. Min-Max Scaling was applied to normalise the data. Specifically, each stock price \\\\( x \\\\) was transformed to a normalised value \\\\( x' \\\\) in the range [0, 1] using\\n\\n\\\\[ x' = \\\\frac{x - x_{\\\\text{min}}}{x_{\\\\text{max}} - x_{\\\\text{min}}}, \\\\]\\n\\nGNNs are essential for analysing stock relationships by capturing dependencies between stocks, modelling the interconnections that traditional methods overlook. (Shi et al. 
2024) developed a graph-based GCN-LSTM model integrating relational data with time-series analysis, achieving more accurate predictions by leveraging GNN to capture stock interconnections. Other studies have emphasised the importance of capturing relational dependencies using Graph Convolutional Networks (GCN), showing that GCN outperforms traditional time-series models by considering both temporal and relational dynamics (Singh et al. 2021, Chen et al. 2018). These findings illustrate that GNNs improve stock market predictions by capturing complex relationships between stocks, and when combined with models such as LSTM, they effectively handle both relational and temporal dynamics.\\n\\n**3.3 LSTM Component**\\n\\n- The LSTM network captured temporal dependencies in stock prices by processing sequences of historical data.\\n- Architecture included an input layer for normalized closing price sequences over optimized time windows.\\n- Multiple LSTM layers were stacked to learn complex temporal patterns using gating mechanisms.\\n- A dense layer transformed LSTM outputs into feature vectors for integration with the GNN component.\\n- Hyperparameters such as learning rate, batch size, and number of epochs were optimized through experimentation.\\n- Learning rates of 0.001, 0.005, and 0.01 were tested, with batch sizes of 11 and 21.\\n- Number of epochs varied between 10 and 50, using early stopping to prevent overfitting.\\n- The Adam optimizer was used for efficient training, and Mean Squared Error (MSE) served as the loss function.\\n\\n**3.4 GNN Component**\", name='document_understanding', id='6ec9ad91-d172-421f-9cf7-7b419c4bba54', tool_call_id='call_orodPM8UEjVSn7vR8uGQ2g2B'),\n", + " ToolMessage(content=\"For the GNN component, a graph representing relationships between stocks was constructed. Each node represented a stock, and edges represented relationships based on Pearson correlation coefficients and association analysis. Pearson correlation captured linear relationships by calculating the correlation coefficients between the daily returns of stock pairs, while association analysis identified non-linear and complex relationships. Combining these methods provided a comprehensive representation of inter-stock relationships, crucial for modelling the interconnected nature of the stock market.\\n\\n3.1.2 Data Preprocessing\\n\\nData preprocessing focused on scaling and structuring the data. The raw data was clean, with no missing values or significant outliers. Min-Max Scaling was applied to normalise the data. Specifically, each stock price \\\\( x \\\\) was transformed to a normalised value \\\\( x' \\\\) in the range [0, 1] using\\n\\n\\\\[ x' = \\\\frac{x - x_{\\\\text{min}}}{x_{\\\\text{max}} - x_{\\\\text{min}}}, \\\\]\\n\\nGNNs are essential for analysing stock relationships by capturing dependencies between stocks, modelling the interconnections that traditional methods overlook. (Shi et al. 2024) developed a graph-based GCN-LSTM model integrating relational data with time-series analysis, achieving more accurate predictions by leveraging GNN to capture stock interconnections. Other studies have emphasised the importance of capturing relational dependencies using Graph Convolutional Networks (GCN), showing that GCN outperforms traditional time-series models by considering both temporal and relational dynamics (Singh et al. 2021, Chen et al. 2018). 
These findings illustrate that GNNs improve stock market predictions by capturing complex relationships between stocks, and when combined with models such as LSTM, they effectively handle both relational and temporal dynamics.\\n\\n**3.2 Graphical Representation of the Stock Network**\\n\\n- Stocks were modeled using a graph \\\\( G = (V, E) \\\\) where each vertex \\\\( v \\\\in V \\\\) represents a stock, and each edge \\\\( e \\\\in E \\\\) represents a significant relationship between two stocks.\\n- Pearson correlation coefficients were calculated between daily returns of stock pairs to quantify linear relationships.\\n- Daily return \\\\( r_t \\\\) for a stock at time \\\\( t \\\\) was computed as:\\n\\n \\\\[\\n r_t = \\\\frac{P_t - P_{t-1}}{P_{t-1}}\\n \\\\]\\n\\n where \\\\( P_t \\\\) is the closing price at time \\\\( t \\\\).\\n\\n- Pearson correlation coefficient \\\\( \\\\rho_{ij} \\\\) between stocks \\\\( i \\\\) and \\\\( j \\\\) was computed as:\\n\\n \\\\[\\n \\\\rho_{ij} = \\\\frac{\\\\sum_{t=1}^{n} (r_{i,t} - \\\\bar{r_i})(r_{j,t} - \\\\bar{r_j})}{\\\\sqrt{\\\\sum_{t=1}^{n} (r_{i,t} - \\\\bar{r_i})^2} \\\\sqrt{\\\\sum_{t=1}^{n} (r_{j,t} - \\\\bar{r_j})^2}}\\n \\\\]\\n\\n**3.4 GNN Component**\\n\\n- The GNN component modeled relational dependencies among stocks based on the constructed graph.\\n- Architecture included an input layer that received the stock graph and initial node features.\", name='document_understanding', id='37905d9b-c537-429a-b3b3-f3d37cf920a6', tool_call_id='call_w2YvW3VYPvUg5IWY0C9C1gvn')]}\n", + "'\\n---\\n'\n", + "---GENERATE---\n", + "context: For the GNN component, a graph representing relationships between stocks was constructed. Each node represented a stock, and edges represented relationships based on Pearson correlation coefficients and association analysis. Pearson correlation captured linear relationships by calculating the correlation coefficients between the daily returns of stock pairs, while association analysis identified non-linear and complex relationships. Combining these methods provided a comprehensive representation of inter-stock relationships, crucial for modelling the interconnected nature of the stock market.\n", + "\n", + "3.1.2 Data Preprocessing\n", + "\n", + "Data preprocessing focused on scaling and structuring the data. The raw data was clean, with no missing values or significant outliers. Min-Max Scaling was applied to normalise the data. Specifically, each stock price \\( x \\) was transformed to a normalised value \\( x' \\) in the range [0, 1] using\n", + "\n", + "\\[ x' = \\frac{x - x_{\\text{min}}}{x_{\\text{max}} - x_{\\text{min}}}, \\]\n", + "\n", + "GNNs are essential for analysing stock relationships by capturing dependencies between stocks, modelling the interconnections that traditional methods overlook. (Shi et al. 2024) developed a graph-based GCN-LSTM model integrating relational data with time-series analysis, achieving more accurate predictions by leveraging GNN to capture stock interconnections. Other studies have emphasised the importance of capturing relational dependencies using Graph Convolutional Networks (GCN), showing that GCN outperforms traditional time-series models by considering both temporal and relational dynamics (Singh et al. 2021, Chen et al. 2018). 
These findings illustrate that GNNs improve stock market predictions by capturing complex relationships between stocks, and when combined with models such as LSTM, they effectively handle both relational and temporal dynamics.\n", + "\n", + "**3.2 Graphical Representation of the Stock Network**\n", + "\n", + "- Stocks were modeled using a graph \\( G = (V, E) \\) where each vertex \\( v \\in V \\) represents a stock, and each edge \\( e \\in E \\) represents a significant relationship between two stocks.\n", + "- Pearson correlation coefficients were calculated between daily returns of stock pairs to quantify linear relationships.\n", + "- Daily return \\( r_t \\) for a stock at time \\( t \\) was computed as:\n", + "\n", + " \\[\n", + " r_t = \\frac{P_t - P_{t-1}}{P_{t-1}}\n", + " \\]\n", + "\n", + " where \\( P_t \\) is the closing price at time \\( t \\).\n", + "\n", + "- Pearson correlation coefficient \\( \\rho_{ij} \\) between stocks \\( i \\) and \\( j \\) was computed as:\n", + "\n", + " \\[\n", + " \\rho_{ij} = \\frac{\\sum_{t=1}^{n} (r_{i,t} - \\bar{r_i})(r_{j,t} - \\bar{r_j})}{\\sqrt{\\sum_{t=1}^{n} (r_{i,t} - \\bar{r_i})^2} \\sqrt{\\sum_{t=1}^{n} (r_{j,t} - \\bar{r_j})^2}}\n", + " \\]\n", + "\n", + "**3.4 GNN Component**\n", + "\n", + "- The GNN component modeled relational dependencies among stocks based on the constructed graph.\n", + "- Architecture included an input layer that received the stock graph and initial node features.\n", + "question: \n", + " How is the graph constructed for the GNN component, and what metrics are used to define relationships between stocks?\n", + " \n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.11/dist-packages/langsmith/client.py:272: LangSmithMissingAPIKeyWarning: API key must be provided when using hosted LangSmith API\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\"Output from node 'generate':\"\n", + "'---'\n", + "{ 'messages': [ 'The graph for the GNN component is constructed with nodes '\n", + " 'representing stocks and edges representing relationships '\n", + " 'between them. These relationships are defined using Pearson '\n", + " 'correlation coefficients to capture linear relationships and '\n", + " 'association analysis to identify non-linear and complex '\n", + " 'relationships. 
This combination provides a comprehensive '\n", + " 'representation of inter-stock relationships.']}\n", + "'\\n---\\n'\n" + ] + } + ], + "source": [ + "import pprint\n", + "\n", + "inputs = {\n", + " \"messages\": [\n", + " (\"user\", \"\"\"\n", + " How is the graph constructed for the GNN component, and what metrics are used to define relationships between stocks?\n", + " \"\"\"),\n", + " ]\n", + "}\n", + "for output in graph.stream(inputs):\n", + " for key, value in output.items():\n", + " pprint.pprint(f\"Output from node '{key}':\")\n", + " pprint.pprint(\"---\")\n", + " pprint.pprint(value, indent=2, width=80, depth=None)\n", + " pprint.pprint(\"\\n---\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "a2413706", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "a2413706", + "outputId": "a89d6e1d-f8c5-4021-d43b-ef772163b441" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "---CALL AGENT---\n", + "\"Output from node 'agent':\"\n", + "'---'\n", + "{ 'messages': [ AIMessage(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_wtkFsHQ6KTgihfM7xWJpvdBX', 'function': {'arguments': '{\"query\":\"MSE of CNN in figure 5\"}', 'name': 'document_understanding'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls', 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'service_tier': 'default'}, id='run--6fb0406c-431a-470f-91c4-eed77a92487b-0', tool_calls=[{'name': 'document_understanding', 'args': {'query': 'MSE of CNN in figure 5'}, 'id': 'call_wtkFsHQ6KTgihfM7xWJpvdBX', 'type': 'tool_call'}])]}\n", + "'\\n---\\n'\n", + "---CHECK RELEVANCE---\n", + "question: \n", + " who is MSE of CNN in the figure 5 ?\n", + " \n", + "context: **Heatmap Information:**\n", + "\n", + "- **Title:** MSE Heatmap for Models and Stocks\n", + "- **Models:** Dense, CNN, Middle, LSTM, LSTM+GNN\n", + "- **Stocks:** AAPL, MSFT, CMCSA, COST, QCOM, ADBE, SBUX, NTU, AMD, INTC\n", + "- **Color Scale:** Ranges from 0.001 to 0.007\n", + "\n", + "**Figure Caption:**\n", + "Figure 6: Heatmap of MSE values for different models across individual stocks.\n", + "\n", + "**Text Extract:**\n", + "\n", + "**4.6 Comparative Study**\n", + "\n", + "The comparative analysis demonstrates the clear advantages of the hybrid LSTM-GNN model over baseline models in predictive accuracy and robustness. While the standalone LSTM captured temporal dependencies effectively, it lacked the ability to model inter-stock relationships significantly influencing market behaviour. Incorporating the GNN component enabled the hybrid model to utilise relational data, capturing complex interactions between stocks and enhancing predictions.\n", + "\n", + "4.5 Comparison with Baseline Models\n", + "\n", + "The hybrid LSTM-GNN model was evaluated against several baseline models, including Linear Regression, CNN, DNN, and a standalone LSTM. The hybrid model outperformed all baselines in terms of MSE.\n", + "\n", + "- Hybrid LSTM-GNN: MSE of 0.00144\n", + "- Linear Regression: MSE of 0.00224\n", + "- Standalone LSTM: MSE of 0.00161\n", + "- CNN: MSE of 0.00302\n", + "- DNN: MSE of 0.00335\n", + "\n", + "The hybrid model consistently achieved lower MSE values across all stocks, demonstrating robustness and generalizability. 
It performed well for stocks like CMCSA, AMD, and INTC, while CNN and DNN models had higher MSE values, especially for more volatile stocks.\n", + "\n", + "Figure 5: Comparison of MSE values across different models.\n", + "\n", + "- **Evaluation Metric**: Mean Squared Error (MSE) defined as:\n", + "\n", + " \\[\n", + " \\text{MSE} = \\frac{1}{n} \\sum_{i=1}^{n} (\\hat{y}_i - y_i)^2\n", + " \\]\n", + "\n", + " where \\( n \\) is the number of predictions, \\( \\hat{y}_i \\) is the predicted stock price, and \\( y_i \\) is the actual stock price.\n", + "\n", + "- **Performance Analysis**:\n", + " - The model effectively integrated time-series and relational data.\n", + " - Consistently low MSE values were observed, with spikes on November 10, 2022, and November 30, 2022, due to market volatility.\n", + "\n", + "- **Figure 4**: Displays MSE values across all test days using the best parameter configuration.\n", + "\n", + "Linear Regression, as a simple baseline model, showed limitations in capturing non-linear relationships and temporal dependencies, resulting in an MSE of 0.00224. Although its MSE was lower than that of the CNN and DNN models, it was less effective than the LSTM-based models due to its inability to model the intricate dynamics of stock markets. These limitations were especially evident in more volatile stocks, where Linear Regression struggled with complex market movements.\n", + "\n", + "A key factor driving the hybrid model superior performance was the expanding window training approach. This method progressively increased the training dataset by incorporating new data as it became available, enabling the model to remain up-to-date with recent market trends. Retraining with the most current data enabled the hybrid LSTM-GNN to continuously adapt to changes in market behaviour.\n", + "---DECISION: DOCS RELEVANT---\n", + "\"Output from node 'retrieve':\"\n", + "'---'\n", + "{ 'messages': [ ToolMessage(content='**Heatmap Information:**\\n\\n- **Title:** MSE Heatmap for Models and Stocks\\n- **Models:** Dense, CNN, Middle, LSTM, LSTM+GNN\\n- **Stocks:** AAPL, MSFT, CMCSA, COST, QCOM, ADBE, SBUX, NTU, AMD, INTC\\n- **Color Scale:** Ranges from 0.001 to 0.007\\n\\n**Figure Caption:**\\nFigure 6: Heatmap of MSE values for different models across individual stocks.\\n\\n**Text Extract:**\\n\\n**4.6 Comparative Study**\\n\\nThe comparative analysis demonstrates the clear advantages of the hybrid LSTM-GNN model over baseline models in predictive accuracy and robustness. While the standalone LSTM captured temporal dependencies effectively, it lacked the ability to model inter-stock relationships significantly influencing market behaviour. Incorporating the GNN component enabled the hybrid model to utilise relational data, capturing complex interactions between stocks and enhancing predictions.\\n\\n4.5 Comparison with Baseline Models\\n\\nThe hybrid LSTM-GNN model was evaluated against several baseline models, including Linear Regression, CNN, DNN, and a standalone LSTM. The hybrid model outperformed all baselines in terms of MSE.\\n\\n- Hybrid LSTM-GNN: MSE of 0.00144\\n- Linear Regression: MSE of 0.00224\\n- Standalone LSTM: MSE of 0.00161\\n- CNN: MSE of 0.00302\\n- DNN: MSE of 0.00335\\n\\nThe hybrid model consistently achieved lower MSE values across all stocks, demonstrating robustness and generalizability. 
It performed well for stocks like CMCSA, AMD, and INTC, while CNN and DNN models had higher MSE values, especially for more volatile stocks.\\n\\nFigure 5: Comparison of MSE values across different models.\\n\\n- **Evaluation Metric**: Mean Squared Error (MSE) defined as:\\n\\n \\\\[\\n \\\\text{MSE} = \\\\frac{1}{n} \\\\sum_{i=1}^{n} (\\\\hat{y}_i - y_i)^2\\n \\\\]\\n\\n where \\\\( n \\\\) is the number of predictions, \\\\( \\\\hat{y}_i \\\\) is the predicted stock price, and \\\\( y_i \\\\) is the actual stock price.\\n\\n- **Performance Analysis**:\\n - The model effectively integrated time-series and relational data.\\n - Consistently low MSE values were observed, with spikes on November 10, 2022, and November 30, 2022, due to market volatility.\\n\\n- **Figure 4**: Displays MSE values across all test days using the best parameter configuration.\\n\\nLinear Regression, as a simple baseline model, showed limitations in capturing non-linear relationships and temporal dependencies, resulting in an MSE of 0.00224. Although its MSE was lower than that of the CNN and DNN models, it was less effective than the LSTM-based models due to its inability to model the intricate dynamics of stock markets. These limitations were especially evident in more volatile stocks, where Linear Regression struggled with complex market movements.\\n\\nA key factor driving the hybrid model superior performance was the expanding window training approach. This method progressively increased the training dataset by incorporating new data as it became available, enabling the model to remain up-to-date with recent market trends. Retraining with the most current data enabled the hybrid LSTM-GNN to continuously adapt to changes in market behaviour.', name='document_understanding', id='7f28343b-0dcb-40d1-834e-50c0a50a995b', tool_call_id='call_wtkFsHQ6KTgihfM7xWJpvdBX')]}\n", + "'\\n---\\n'\n", + "---GENERATE---\n", + "context: **Heatmap Information:**\n", + "\n", + "- **Title:** MSE Heatmap for Models and Stocks\n", + "- **Models:** Dense, CNN, Middle, LSTM, LSTM+GNN\n", + "- **Stocks:** AAPL, MSFT, CMCSA, COST, QCOM, ADBE, SBUX, NTU, AMD, INTC\n", + "- **Color Scale:** Ranges from 0.001 to 0.007\n", + "\n", + "**Figure Caption:**\n", + "Figure 6: Heatmap of MSE values for different models across individual stocks.\n", + "\n", + "**Text Extract:**\n", + "\n", + "**4.6 Comparative Study**\n", + "\n", + "The comparative analysis demonstrates the clear advantages of the hybrid LSTM-GNN model over baseline models in predictive accuracy and robustness. While the standalone LSTM captured temporal dependencies effectively, it lacked the ability to model inter-stock relationships significantly influencing market behaviour. Incorporating the GNN component enabled the hybrid model to utilise relational data, capturing complex interactions between stocks and enhancing predictions.\n", + "\n", + "4.5 Comparison with Baseline Models\n", + "\n", + "The hybrid LSTM-GNN model was evaluated against several baseline models, including Linear Regression, CNN, DNN, and a standalone LSTM. The hybrid model outperformed all baselines in terms of MSE.\n", + "\n", + "- Hybrid LSTM-GNN: MSE of 0.00144\n", + "- Linear Regression: MSE of 0.00224\n", + "- Standalone LSTM: MSE of 0.00161\n", + "- CNN: MSE of 0.00302\n", + "- DNN: MSE of 0.00335\n", + "\n", + "The hybrid model consistently achieved lower MSE values across all stocks, demonstrating robustness and generalizability. 
It performed well for stocks like CMCSA, AMD, and INTC, while CNN and DNN models had higher MSE values, especially for more volatile stocks.\n", + "\n", + "Figure 5: Comparison of MSE values across different models.\n", + "\n", + "- **Evaluation Metric**: Mean Squared Error (MSE) defined as:\n", + "\n", + " \\[\n", + " \\text{MSE} = \\frac{1}{n} \\sum_{i=1}^{n} (\\hat{y}_i - y_i)^2\n", + " \\]\n", + "\n", + " where \\( n \\) is the number of predictions, \\( \\hat{y}_i \\) is the predicted stock price, and \\( y_i \\) is the actual stock price.\n", + "\n", + "- **Performance Analysis**:\n", + " - The model effectively integrated time-series and relational data.\n", + " - Consistently low MSE values were observed, with spikes on November 10, 2022, and November 30, 2022, due to market volatility.\n", + "\n", + "- **Figure 4**: Displays MSE values across all test days using the best parameter configuration.\n", + "\n", + "Linear Regression, as a simple baseline model, showed limitations in capturing non-linear relationships and temporal dependencies, resulting in an MSE of 0.00224. Although its MSE was lower than that of the CNN and DNN models, it was less effective than the LSTM-based models due to its inability to model the intricate dynamics of stock markets. These limitations were especially evident in more volatile stocks, where Linear Regression struggled with complex market movements.\n", + "\n", + "A key factor driving the hybrid model superior performance was the expanding window training approach. This method progressively increased the training dataset by incorporating new data as it became available, enabling the model to remain up-to-date with recent market trends. Retraining with the most current data enabled the hybrid LSTM-GNN to continuously adapt to changes in market behaviour.\n", + "question: \n", + " who is MSE of CNN in the figure 5 ?\n", + " \n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.11/dist-packages/langsmith/client.py:272: LangSmithMissingAPIKeyWarning: API key must be provided when using hosted LangSmith API\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\"Output from node 'generate':\"\n", + "'---'\n", + "{'messages': ['The MSE of CNN in Figure 5 is 0.00302.']}\n", + "'\\n---\\n'\n" + ] + } + ], + "source": [ + "import pprint\n", + "\n", + "inputs = {\n", + " \"messages\": [\n", + " (\"user\", \"\"\"\n", + " who is MSE of CNN in the figure 5 ?\n", + " \"\"\"),\n", + " ]\n", + "}\n", + "for output in graph.stream(inputs):\n", + " for key, value in output.items():\n", + " pprint.pprint(f\"Output from node '{key}':\")\n", + " pprint.pprint(\"---\")\n", + " pprint.pprint(value, indent=2, width=80, depth=None)\n", + " pprint.pprint(\"\\n---\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "fed0e282", + "metadata": { + "id": "fed0e282", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "849eb6d9-0f3b-44d3-cec1-36504a697551" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "---CALL AGENT---\n", + "\"Output from node 'agent':\"\n", + "'---'\n", + "{ 'messages': [ AIMessage(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_AvQ7tFlDQbNbkBYhyQ0HSJj3', 'function': {'arguments': '{\"query\":\"highest MSE value in figure 4\"}', 'name': 'document_understanding'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls', 'model_name': 
'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'service_tier': 'default'}, id='run--605026a7-2674-4317-b3b4-8374b0e06729-0', tool_calls=[{'name': 'document_understanding', 'args': {'query': 'highest MSE value in figure 4'}, 'id': 'call_AvQ7tFlDQbNbkBYhyQ0HSJj3', 'type': 'tool_call'}])]}\n", + "'\\n---\\n'\n", + "---CHECK RELEVANCE---\n", + "question: \n", + " what could be the number of test day corresponding to the highest MSE value in figure 4?\n", + " \n", + "context: - **Evaluation Metric**: Mean Squared Error (MSE) defined as:\n", + "\n", + " \\[\n", + " \\text{MSE} = \\frac{1}{n} \\sum_{i=1}^{n} (\\hat{y}_i - y_i)^2\n", + " \\]\n", + "\n", + " where \\( n \\) is the number of predictions, \\( \\hat{y}_i \\) is the predicted stock price, and \\( y_i \\) is the actual stock price.\n", + "\n", + "- **Performance Analysis**:\n", + " - The model effectively integrated time-series and relational data.\n", + " - Consistently low MSE values were observed, with spikes on November 10, 2022, and November 30, 2022, due to market volatility.\n", + "\n", + "- **Figure 4**: Displays MSE values across all test days using the best parameter configuration.\n", + "\n", + "**Heatmap Information:**\n", + "\n", + "- **Title:** MSE Heatmap for Models and Stocks\n", + "- **Models:** Dense, CNN, Middle, LSTM, LSTM+GNN\n", + "- **Stocks:** AAPL, MSFT, CMCSA, COST, QCOM, ADBE, SBUX, NTU, AMD, INTC\n", + "- **Color Scale:** Ranges from 0.001 to 0.007\n", + "\n", + "**Figure Caption:**\n", + "Figure 6: Heatmap of MSE values for different models across individual stocks.\n", + "\n", + "**Text Extract:**\n", + "\n", + "**4.6 Comparative Study**\n", + "\n", + "The comparative analysis demonstrates the clear advantages of the hybrid LSTM-GNN model over baseline models in predictive accuracy and robustness. While the standalone LSTM captured temporal dependencies effectively, it lacked the ability to model inter-stock relationships significantly influencing market behaviour. Incorporating the GNN component enabled the hybrid model to utilise relational data, capturing complex interactions between stocks and enhancing predictions.\n", + "\n", + "4.5 Comparison with Baseline Models\n", + "\n", + "The hybrid LSTM-GNN model was evaluated against several baseline models, including Linear Regression, CNN, DNN, and a standalone LSTM. The hybrid model outperformed all baselines in terms of MSE.\n", + "\n", + "- Hybrid LSTM-GNN: MSE of 0.00144\n", + "- Linear Regression: MSE of 0.00224\n", + "- Standalone LSTM: MSE of 0.00161\n", + "- CNN: MSE of 0.00302\n", + "- DNN: MSE of 0.00335\n", + "\n", + "The hybrid model consistently achieved lower MSE values across all stocks, demonstrating robustness and generalizability. It performed well for stocks like CMCSA, AMD, and INTC, while CNN and DNN models had higher MSE values, especially for more volatile stocks.\n", + "\n", + "Figure 5: Comparison of MSE values across different models.\n", + "\n", + "**Text Extracted from Image:**\n", + "\n", + "Incorporated (QCOM), Adobe Inc. (ADBE), Starbucks Corporation (SBUX), Intuit Inc. (INTU), Advanced Micro Devices (AMD), and Intel Corporation (INTC). These stocks were chosen due to their significant market capitalisation and influence, ensuring broad applicability of the findings.\n", + "\n", + "The data spanned from January 1, 2005, to December 31, 2023, encompassing various market conditions and providing a robust dataset for model training and evaluation. 
Features extracted included daily open, high, low, close, adjusted close prices, and trading volume, offering a detailed view of market activity. Figure 1 shows an example of normalised closing prices for a sample stock, along with its 50-day and 200-day moving averages.\n", + "\n", + "Figure 1: Normalised closing prices of a sample stock with its 50-day and 200-day moving averages.\n", + "\n", + "3.1.1 Feature Engineering\n", + "---DECISION: DOCS NOT RELEVANT---\n", + "no\n", + "\"Output from node 'retrieve':\"\n", + "'---'\n", + "{ 'messages': [ ToolMessage(content='- **Evaluation Metric**: Mean Squared Error (MSE) defined as:\\n\\n \\\\[\\n \\\\text{MSE} = \\\\frac{1}{n} \\\\sum_{i=1}^{n} (\\\\hat{y}_i - y_i)^2\\n \\\\]\\n\\n where \\\\( n \\\\) is the number of predictions, \\\\( \\\\hat{y}_i \\\\) is the predicted stock price, and \\\\( y_i \\\\) is the actual stock price.\\n\\n- **Performance Analysis**:\\n - The model effectively integrated time-series and relational data.\\n - Consistently low MSE values were observed, with spikes on November 10, 2022, and November 30, 2022, due to market volatility.\\n\\n- **Figure 4**: Displays MSE values across all test days using the best parameter configuration.\\n\\n**Heatmap Information:**\\n\\n- **Title:** MSE Heatmap for Models and Stocks\\n- **Models:** Dense, CNN, Middle, LSTM, LSTM+GNN\\n- **Stocks:** AAPL, MSFT, CMCSA, COST, QCOM, ADBE, SBUX, NTU, AMD, INTC\\n- **Color Scale:** Ranges from 0.001 to 0.007\\n\\n**Figure Caption:**\\nFigure 6: Heatmap of MSE values for different models across individual stocks.\\n\\n**Text Extract:**\\n\\n**4.6 Comparative Study**\\n\\nThe comparative analysis demonstrates the clear advantages of the hybrid LSTM-GNN model over baseline models in predictive accuracy and robustness. While the standalone LSTM captured temporal dependencies effectively, it lacked the ability to model inter-stock relationships significantly influencing market behaviour. Incorporating the GNN component enabled the hybrid model to utilise relational data, capturing complex interactions between stocks and enhancing predictions.\\n\\n4.5 Comparison with Baseline Models\\n\\nThe hybrid LSTM-GNN model was evaluated against several baseline models, including Linear Regression, CNN, DNN, and a standalone LSTM. The hybrid model outperformed all baselines in terms of MSE.\\n\\n- Hybrid LSTM-GNN: MSE of 0.00144\\n- Linear Regression: MSE of 0.00224\\n- Standalone LSTM: MSE of 0.00161\\n- CNN: MSE of 0.00302\\n- DNN: MSE of 0.00335\\n\\nThe hybrid model consistently achieved lower MSE values across all stocks, demonstrating robustness and generalizability. It performed well for stocks like CMCSA, AMD, and INTC, while CNN and DNN models had higher MSE values, especially for more volatile stocks.\\n\\nFigure 5: Comparison of MSE values across different models.\\n\\n**Text Extracted from Image:**\\n\\nIncorporated (QCOM), Adobe Inc. (ADBE), Starbucks Corporation (SBUX), Intuit Inc. (INTU), Advanced Micro Devices (AMD), and Intel Corporation (INTC). These stocks were chosen due to their significant market capitalisation and influence, ensuring broad applicability of the findings.\\n\\nThe data spanned from January 1, 2005, to December 31, 2023, encompassing various market conditions and providing a robust dataset for model training and evaluation. Features extracted included daily open, high, low, close, adjusted close prices, and trading volume, offering a detailed view of market activity. 
Figure 1 shows an example of normalised closing prices for a sample stock, along with its 50-day and 200-day moving averages.\\n\\nFigure 1: Normalised closing prices of a sample stock with its 50-day and 200-day moving averages.\\n\\n3.1.1 Feature Engineering', name='document_understanding', id='5e6753b9-5613-4729-a8b5-e04d72c677f9', tool_call_id='call_AvQ7tFlDQbNbkBYhyQ0HSJj3')]}\n", + "'\\n---\\n'\n", + "---TRANSFORM QUERY---\n", + "\"Output from node 'rewrite':\"\n", + "'---'\n", + "{ 'messages': [ AIMessage(content='What is the test day associated with the highest Mean Squared Error (MSE) value shown in Figure 4?', additional_kwargs={}, response_metadata={'finish_reason': 'stop', 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'service_tier': 'default'}, id='run--b2158157-4e43-416f-a7ff-d1777f27b851-0')]}\n", + "'\\n---\\n'\n", + "---CALL AGENT---\n", + "\"Output from node 'agent':\"\n", + "'---'\n", + "{ 'messages': [ AIMessage(content='The test days corresponding to the highest MSE values in Figure 4 are November 10, 2022, and November 30, 2022, due to market volatility.', additional_kwargs={}, response_metadata={'finish_reason': 'stop', 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'service_tier': 'default'}, id='run--b4b69c5b-1bf3-4691-a042-58c6e44eea3b-0')]}\n", + "'\\n---\\n'\n" + ] + } + ], + "source": [ + "import pprint\n", + "\n", + "inputs = {\n", + " \"messages\": [\n", + " (\"user\", \"\"\"\n", + " what could be the number of test day corresponding to the highest MSE value in figure 4?\n", + " \"\"\"),\n", + " ]\n", + "}\n", + "for output in graph.stream(inputs):\n", + " for key, value in output.items():\n", + " pprint.pprint(f\"Output from node '{key}':\")\n", + " pprint.pprint(\"---\")\n", + " pprint.pprint(value, indent=2, width=80, depth=None)\n", + " pprint.pprint(\"\\n---\\n\")" + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "vgBtUdvpePbw" + }, + "id": "vgBtUdvpePbw", + "execution_count": 15, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "Sag01HKhlFHJ" + }, + "id": "Sag01HKhlFHJ", + "execution_count": 24, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "bd-G3xpHlT1x" + }, + "id": "bd-G3xpHlT1x", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "GCX3PV0Wnf8P" + }, + "id": "GCX3PV0Wnf8P", + "execution_count": null, + "outputs": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + }, + "colab": { + "provenance": [] + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/Agentic-langGraph-RAG/readme.md b/Agentic-langGraph-RAG/readme.md new file mode 100644 index 0000000..22a8d5a --- /dev/null +++ b/Agentic-langGraph-RAG/readme.md @@ -0,0 +1,144 @@ +# Agentic PDF RAG System + +An intelligent document analysis system that combines PDF processing, vector search, and agentic AI workflows to provide accurate answers from document content. 
+
+## Overview
+
+This system processes PDF documents by converting them to images, extracting text using GPT-4o vision capabilities, and creating a searchable knowledge base. An intelligent agent then handles user queries by deciding when to retrieve information, evaluating document relevance, and generating accurate responses.
+
+## Key Features
+
+- **PDF to Image Conversion**: Uses pypdfium2 to convert PDF pages to high-quality images
+- **OCR with GPT-4o**: Leverages GPT-4o vision model for accurate text extraction
+- **Vector Database**: Stores embeddings in Qdrant for efficient similarity search
+- **Intelligent Agent**: LangGraph-powered agent that makes smart retrieval decisions
+- **Document Grading**: Automatically evaluates relevance of retrieved documents
+- **Query Rewriting**: Improves queries when initial results are not relevant
+- **Expanding Window Training**: Demonstrated in the example stock-prediction paper analysed by the system
+
+## System Architecture
+
+```mermaid
+flowchart TD
+    A[PDF Document] --> B[Convert Pages to Images]
+    B --> C[GPT-4o OCR Text Extraction]
+    C --> D[Text Preprocessing & Chunking]
+    D --> E[Generate Embeddings]
+    E --> F[Store in Qdrant Vector DB]
+
+    G[User Query] --> H[Agent Node]
+    H --> I{Retrieve Documents?}
+
+    I -->|Yes| J[Vector Search & Retrieval]
+    I -->|No| K[Direct Response]
+
+    J --> L[Grade Document Relevance]
+    L --> M{Documents Relevant?}
+
+    M -->|Yes| N[Generate Answer]
+    M -->|No| O[Rewrite Query]
+
+    O --> H
+    N --> P[Final Response]
+    K --> P
+
+    F -.-> J
+
+    subgraph "Document Processing"
+        A
+        B
+        C
+        D
+        E
+        F
+    end
+
+    subgraph "Query Processing"
+        G
+        H
+        I
+        J
+        L
+        M
+        N
+        O
+        K
+        P
+    end
+```
+
+## How It Works
+
+### Document Processing
+1. **PDF Conversion**: PDF pages are converted to high-resolution images
+2. **Text Extraction**: GPT-4o analyzes images and extracts text content
+3. **Preprocessing**: Text is cleaned, chunked, and prepared for embedding
+4. **Vector Storage**: Document chunks are embedded and stored in Qdrant
+
+### Query Processing
+1. **Agent Decision**: Intelligent agent decides whether to retrieve documents
+2. **Vector Search**: If needed, performs similarity search in vector database
+3. **Relevance Grading**: Evaluates if retrieved documents answer the query
+4. **Response Generation**: Creates final answer or rewrites query if needed
+
+## Example Use Case
+
+The system is demonstrated with a research paper on "Stock Price Prediction Using a Hybrid LSTM-GNN Model". Users can ask questions like:
+
+- "How is the graph constructed for the GNN component?"
+- "What is the MSE of CNN in Figure 5?"
+- "What are the test days with highest MSE values?"
+
+## Technologies Used
+
+- **LangChain**: Framework for building LLM applications
+- **LangGraph**: Agent workflow orchestration
+- **OpenAI GPT-4o**: Vision and text generation model
+- **Qdrant**: Vector database for similarity search
+- **pypdfium2**: PDF processing and image conversion
+- **Python**: Core programming language
+
+## Installation
+
+```bash
+pip install pypdfium2 backoff langchain-community langchain langchain-openai langgraph qdrant-client
+```
+
+## Configuration
+
+Set your API keys:
+```python
+OPENAI_API_KEY = "your-openai-api-key"
+QDRANT_API_KEY = "your-qdrant-api-key"  # Optional for local deployment
+QDRANT_URL = "your-qdrant-url"  # Optional for local deployment
+```
+
+## Usage
+
+1. Load your PDF document
+2. Run the document processing pipeline
+3. Start querying the system with natural language questions
+4. The agent will intelligently retrieve and process relevant information (a minimal query-loop sketch follows below)
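+
+To make step 4 concrete, here is a minimal sketch of the query loop, mirroring the notebook in this folder. It assumes `graph` is the compiled LangGraph workflow built there, and uses one of the example questions above:
+
+```python
+import pprint
+
+# Any natural-language question about the indexed PDF works here
+inputs = {"messages": [("user", "How is the graph constructed for the GNN component?")]}
+
+# Stream the compiled workflow and print each node's output as it runs
+for output in graph.stream(inputs):
+    for node, value in output.items():
+        pprint.pprint(f"Output from node '{node}':")
+        pprint.pprint(value, indent=2, width=80)
+```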
+
+## Benefits
+
+- **Intelligent Retrieval**: Only searches when necessary
+- **Quality Control**: Validates document relevance before responding
+- **Adaptive**: Improves queries automatically when initial results are poor
+- **Accurate**: Combines vision-based OCR with semantic search
+- **Scalable**: Vector database enables fast search across large document collections
+
+This system demonstrates advanced RAG (Retrieval-Augmented Generation) techniques with agentic AI workflows for robust document analysis and question answering.
+
+## Tutorial Article
+For a detailed step-by-step guide on building this system, read the full tutorial on Medium:
+[**How I Built an Agentic RAG System with Qdrant to Chat with Any PDF**](https://medium.com/@mohammedarbinsibi/how-i-built-an-agentic-rag-system-with-qdrant-to-chat-with-any-pdf-4f680e93397e)
+
+### References:
+* [LangChain](https://github.com/langchain-ai/langchain)
+* [LangGraph](https://langchain-ai.github.io/langgraph/)
+* [LangGraph Agentic RAG](https://github.com/langchain-ai/langgraph/blob/main/examples/rag/langgraph_agentic_rag.ipynb)
+* [Qdrant documentation](https://qdrant.tech/documentation/)
+* [LSTM-GNN paper](https://arxiv.org/pdf/2502.15813)
diff --git a/README.md b/README.md
index d8b9cfa..1656c4e 100644
--- a/README.md
+++ b/README.md
@@ -15,4 +15,5 @@ This repo contains a collection of tutorials, demos, and how-to guides on how to
 | [Step-back prompting in Langchain RAG](./langchain-qdrant-step-back-prompting) | Step-back prompting for RAG, implemented in Langchain | OpenAI, Qdrant, Cohere, Langchain |
 | [Collaborative Filtering and MovieLens](./sparse-vectors-movies-reco) | A notebook demonstrating how to build a collaborative filtering system using Qdrant | Sparse Vectors, Qdrant |
 | [Use semantic search to navigate your codebase](./code-search/) | Implement semantic search application for code search task | Qdrant, Python, sentence-transformers, Jina |
+| [Agentic-langGraph-RAG Tutorial](./Agentic-langGraph-RAG/Agentic_PDF_RAG.ipynb) | Tutorial for Agentic RAG using LangGraph and Qdrant | LangGraph, Qdrant, GPT-4o, RAG |
diff --git a/qdrant_101_audio_data/README.md b/qdrant_101_audio_data/README.md
index 2abe6f8..84272cd 100644
--- a/qdrant_101_audio_data/README.md
+++ b/qdrant_101_audio_data/README.md
@@ -1,6 +1,6 @@
 # Qdrant & Audio Data
 
-![main](../images/main_pic.png)
+![main](./img/main_pic.png)
 
 Welcome to this tutorial on vector databases and music recommendation systems using Python and Qdrant. Here, we will learn about how to get started with audio data, embeddings and vector databases.
diff --git a/qdrant_101_text_data/README.md b/qdrant_101_text_data/README.md
index 01a5034..f54f2b0 100644
--- a/qdrant_101_text_data/README.md
+++ b/qdrant_101_text_data/README.md
@@ -1,6 +1,6 @@
 # Qdrant & Text Data
 
-![qdrant](../images/crab_nlp.png)
+![qdrant](./img/crab_nlp.png)
 
 This tutorial will show you how to use Qdrant to develop a semantic search service. At its core, this service will harness Natural Language Processing (NLP) methods and use Qdrant's API to store, search, and manage vectors with an additional payload.