diff --git a/src/TransformerVCA/VCA_TRANSFORMERv10_7.ipynb b/src/TransformerVCA/VCA_TRANSFORMERv10_7.ipynb index fdffb84..6a30217 100644 --- a/src/TransformerVCA/VCA_TRANSFORMERv10_7.ipynb +++ b/src/TransformerVCA/VCA_TRANSFORMERv10_7.ipynb @@ -1,8 +1,14 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "e8bd8323", + "metadata": {}, + "source": [] + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "cbfcbfb3", "metadata": {}, "outputs": [], @@ -12,58 +18,117 @@ "\n", "class SimpleConfig:\n", " def __init__(self, graph_path: str = \"\"):\n", + " \"\"\"\n", + " Initialize a configuration object for graph-based or physics-inspired models.\n", + " If a valid graph file path is provided, the graph will be loaded from that file.\n", + " Otherwise, a random symmetric coupling matrix (representing a random graph)\n", + " will be generated.\n", + "\n", + " Parameters:\n", + " graph_path (str): Optional path to a text file defining the graph structure.\n", + " \"\"\"\n", + "\n", " if graph_path and os.path.exists(graph_path):\n", - " print(\"using\",graph_path)\n", + " print(\"using\", graph_path)\n", + " # Load the coupling matrix (Jz) and edge list (graph_list) from a file\n", " self.Jz, self.graph_list = self.from_txt_load_graph(graph_path)\n", + " # Derive the number of nodes based on the loaded edge list\n", " self.num_nodes = self.obtain_num_nodes(self.graph_list)\n", " else:\n", " print(\"random graph\")\n", - " self.num_nodes = 20\n", + " # If no file is given or file not found, create a random graph\n", + " self.num_nodes = 20 # Default number of nodes\n", + " # Generate a random symmetric matrix with entries ~ N(0, 1/sqrt(N))\n", + " # This scaling helps maintain numerical stability as N increases\n", " self.Jz = np.random.normal(0, 1/np.sqrt(self.num_nodes), size=(self.num_nodes, self.num_nodes))\n", + " # Make sure the matrix is symmetric, since the graph is undirected\n", " self.Jz = (self.Jz + self.Jz.T) / 2\n", + " # Empty list — no explicit edge data since it’s random\n", " self.graph_list = []\n", "\n", + " # Random seed for reproducibility\n", " self.seed = 1\n", "\n", - " self.d_model = 16\n", - " self.num_heads = 1\n", - " self.num_layers = 1\n", - " self.dff = 64\n", - " \n", - " self.numsamples = 64\n", - " self.lr = 5*(1e-4)\n", - " self.T0 = 1.0\n", - " self.Bx0 = 0\n", - " self.num_warmup_steps = 100\n", - " self.num_annealing_steps = 20\n", - " self.num_equilibrium_steps = 10\n", + " # Transformer model hyperparameters (for possible machine learning applications)\n", + " self.d_model = 16 # Hidden dimension size of the model\n", + " self.num_heads = 1 # Number of attention heads in the transformer\n", + " self.num_layers = 1 # Number of transformer layers\n", + " self.dff = 64 # Dimension of the feedforward network inside the transformer\n", + "\n", + " # Simulation or training parameters\n", + " self.numsamples = 64 # Number of Monte Carlo or training samples\n", + " self.lr = 5 * (1e-4) # Learning rate for optimizer\n", + " self.T0 = 1.0 # Initial temperature (often used in annealing)\n", + " self.Bx0 = 0 # Initial transverse field (for quantum models)\n", + " self.num_warmup_steps = 100 # Steps before starting annealing/sampling\n", + " self.num_annealing_steps = 20 # Number of annealing (temperature reduction) steps\n", + " self.num_equilibrium_steps = 10 # Steps for reaching equilibrium per temperature\n", "\n", " def from_txt_load_graph(self, graph_path: str):\n", + " \"\"\"\n", + " Load a graph structure from a text file.\n", + 
"\n", + " Expected file format:\n", + " Line 1: \n", + " Following lines: \n", + " - Spins are 1-indexed in the file and will be converted to 0-indexed internally.\n", + " - Each line represents an undirected weighted edge between two spins/nodes.\n", + "\n", + " Returns:\n", + " Jz (np.ndarray): A symmetric NxN coupling (adjacency) matrix.\n", + " graph_list (list): A list of tuples (node1, node2, weight).\n", + " \"\"\"\n", " with open(graph_path, \"r\") as file:\n", + " # First line defines the number of spins (nodes) and couplings (edges)\n", " spins, couplings = file.readline().split(\" \")\n", " N = int(spins)\n", + " # Initialize an empty NxN matrix for edge weights\n", " Jz = np.zeros((N, N), dtype=np.float64)\n", " graph_list = []\n", + "\n", + " # Read edge data line by line\n", " line = file.readline()\n", " while line and line.strip():\n", " spin1, spin2, weight = line.split(\" \")\n", + " # Convert to 0-based indices for internal storage\n", " idx1, idx2 = int(spin1) - 1, int(spin2) - 1\n", " weight = float(weight)\n", + " # Store the weight symmetrically since the graph is undirected\n", " Jz[idx1, idx2] = weight\n", " Jz[idx2, idx1] = weight\n", + " # Append edge information to the list\n", " graph_list.append((idx1, idx2, weight))\n", + " # Read next line\n", " line = file.readline()\n", + "\n", " return Jz, graph_list\n", "\n", " def obtain_num_nodes(self, graph_list):\n", + " \"\"\"\n", + " Determine the number of nodes in a graph based on its edge list.\n", + "\n", + " Parameters:\n", + " graph_list (list): List of edges, each represented as (node1, node2, weight).\n", + "\n", + " Returns:\n", + " int: The total number of distinct nodes.\n", + " \"\"\"\n", " if not graph_list:\n", " return 0\n", + " # Find the maximum node index appearing in any edge, then add 1\n", + " # because nodes are zero-indexed internally\n", " return max([max(n0, n1) for n0, n1, w in graph_list]) + 1\n" ] }, + { + "cell_type": "markdown", + "id": "d5bb4545", + "metadata": {}, + "source": [] + }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "1c2bf58c", "metadata": {}, "outputs": [], @@ -75,22 +140,38 @@ "\n", "class TransformerWavefunction(nn.Module):\n", " def __init__(self, config):\n", + " \"\"\"\n", + " A Transformer-based autoregressive wavefunction model.\n", + " Designed to represent probability distributions over spin configurations,\n", + " similar to models used in quantum Monte Carlo or generative physical systems.\n", + "\n", + " Args:\n", + " config: A configuration object (e.g., SimpleConfig) containing model hyperparameters.\n", + " \"\"\"\n", " super().__init__()\n", - " self.N = config.num_nodes\n", - " self.d_model = config.d_model\n", + " self.N = config.num_nodes # Number of spins/nodes in the system\n", + " self.d_model = config.d_model # Hidden dimension of the Transformer\n", " self.num_heads = config.num_heads\n", " self.num_layers = config.num_layers\n", - " self.dff = config.dff\n", + " self.dff = config.dff # Feedforward layer size\n", + "\n", + " # ===== Embedding layer setup =====\n", + " # We use an embedding for 3 tokens:\n", + " # 0 -> spin down, 1 -> spin up, 2 -> (start-of-sequence token)\n", + " self.embedding = nn.Embedding(3, self.d_model)\n", "\n", - " # ===== 关键约束:更“温和”的表示 =====\n", - " self.embedding = nn.Embedding(3, self.d_model) # 0/1=spin, 2=\n", - " import math\n", + " # Positional embeddings (added to token embeddings)\n", + " # Randomly initialized and scaled by sqrt(d_model) for stability\n", " self.pos_embedding = 
+  {
+   "cell_type": "markdown",
+   "id": "d5bb4545",
+   "metadata": {},
+   "source": []
+  },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "1c2bf58c",
    "metadata": {},
    "outputs": [],
@@ -75,22 +140,38 @@
    "\n",
    "class TransformerWavefunction(nn.Module):\n",
    "    def __init__(self, config):\n",
+    "        \"\"\"\n",
+    "        A Transformer-based autoregressive wavefunction model.\n",
+    "        Designed to represent probability distributions over spin configurations,\n",
+    "        similar to models used in quantum Monte Carlo or generative physical systems.\n",
+    "\n",
+    "        Args:\n",
+    "            config: A configuration object (e.g., SimpleConfig) containing model hyperparameters.\n",
+    "        \"\"\"\n",
     "        super().__init__()\n",
-    "        self.N = config.num_nodes\n",
-    "        self.d_model = config.d_model\n",
+    "        self.N = config.num_nodes          # Number of spins/nodes in the system\n",
+    "        self.d_model = config.d_model      # Hidden dimension of the Transformer\n",
     "        self.num_heads = config.num_heads\n",
     "        self.num_layers = config.num_layers\n",
-    "        self.dff = config.dff\n",
+    "        self.dff = config.dff              # Feedforward layer size\n",
+    "\n",
+    "        # ===== Embedding layer setup =====\n",
+    "        # We use an embedding for 3 tokens:\n",
+    "        # 0 -> spin down, 1 -> spin up, 2 -> <sos> (start-of-sequence token)\n",
+    "        self.embedding = nn.Embedding(3, self.d_model)\n",
     "\n",
-    "        # ===== Key constraint: a \"gentler\" representation =====\n",
-    "        self.embedding = nn.Embedding(3, self.d_model)  # 0/1=spin, 2=<sos>\n",
     "        import math\n",
+    "        # Positional embeddings (added to token embeddings)\n",
+    "        # Randomly initialized and scaled by sqrt(d_model) for stability\n",
+    "        self.pos_embedding = nn.Parameter(\n",
     "            torch.randn(1, self.N + 1, self.d_model) / math.sqrt(self.d_model)\n",
     "        )\n",
+    "\n",
+    "        # Layer normalization for stabilizing embedding outputs\n",
     "        self.emb_ln = nn.LayerNorm(self.d_model)\n",
     "\n",
-    "        # A more stable EncoderLayer: pre-LN + dropout\n",
+    "        # ===== Transformer Encoder =====\n",
+    "        # Using a pre-LayerNorm TransformerEncoderLayer (norm_first=True)\n",
+    "        # Pre-LN is more stable for deep or autoregressive settings.\n",
     "        decoder_layer = nn.TransformerEncoderLayer(\n",
     "            d_model=self.d_model,\n",
     "            nhead=self.num_heads,\n",
@@ -98,89 +179,138 @@
     "            dropout=0.0,\n",
     "            activation=\"relu\",\n",
     "            batch_first=True,\n",
-    "            norm_first=True  # pre-LN (remove this argument if your environment does not support it)\n",
+    "            norm_first=True  # Use pre-LN (set to False or remove if unsupported)\n",
     "        )\n",
     "        self.transformer = nn.TransformerEncoder(decoder_layer, num_layers=self.num_layers)\n",
     "\n",
+    "        # Final linear layer to map Transformer outputs to logits over spin values (0/1)\n",
     "        self.fc_out = nn.Linear(self.d_model, 2)\n",
     "\n",
-    "        # Output temperature (scales logits to avoid over-sharp outputs); can be made an nn.Parameter to learn it\n",
-    "        self.out_tau = 1.2  # stable in the 1.0-1.5 range\n",
+    "        # Output temperature scaling (helps prevent overly sharp probability distributions)\n",
+    "        # This can also be defined as a learnable parameter if needed.\n",
+    "        self.out_tau = 1.2  # Recommended range: 1.0-1.5\n",
     "\n",
     "    @staticmethod\n",
     "    def _causal_mask(L, device):\n",
-    "        # Upper-triangular True entries are \"masked\" (no looking at the future)\n",
+    "        \"\"\"\n",
+    "        Create a causal attention mask of size (L, L).\n",
+    "        Entries above the diagonal are True (masked out),\n",
+    "        ensuring that each token only attends to past tokens.\n",
+    "        \"\"\"\n",
     "        return torch.triu(torch.ones(L, L, device=device, dtype=torch.bool), diagonal=1)\n",
     "\n",
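For reference (not part of the diff), the mask convention looks like this for L = 4; `True` marks the positions a query must not attend to:

```python
import torch

L = 4
mask = torch.triu(torch.ones(L, L, dtype=torch.bool), diagonal=1)
print(mask)
# tensor([[False,  True,  True,  True],
#         [False, False,  True,  True],
#         [False, False, False,  True],
#         [False, False, False, False]])
# Row i (query i) may attend only to columns j <= i: itself and the past.
```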
     "    def forward(self, tokens, use_out_tau=True):\n",
     "        \"\"\"\n",
-    "        tokens: (B, T) where T <= N+1; the first token may be <sos> (2)\n",
-    "        Returns: probs (B, T, 2)\n",
+    "        Forward pass of the Transformer.\n",
+    "\n",
+    "        Args:\n",
+    "            tokens (torch.LongTensor): Input sequence of shape (B, T), where\n",
+    "                T <= N+1. The first token may be the <sos> token (value=2).\n",
+    "            use_out_tau (bool): Whether to apply output temperature scaling.\n",
+    "\n",
+    "        Returns:\n",
+    "            probs (torch.FloatTensor): Shape (B, T, 2), probabilities for spin values (0/1)\n",
     "        \"\"\"\n",
+    "        # Token embeddings + positional embeddings\n",
     "        x = self.embedding(tokens) + self.pos_embedding[:, :tokens.size(1), :]\n",
     "        x = self.emb_ln(x)\n",
     "\n",
+    "        # Apply causal attention mask to prevent information leakage from future steps\n",
     "        mask = self._causal_mask(tokens.size(1), tokens.device)\n",
+    "\n",
+    "        # Pass through Transformer encoder\n",
     "        h = self.transformer(x, mask=mask)\n",
+    "\n",
+    "        # Map final hidden states to logits for 2 spin classes\n",
     "        logits = self.fc_out(h)\n",
     "\n",
+    "        # Apply temperature scaling to logits for smoother probabilities\n",
     "        if use_out_tau and self.out_tau != 1.0:\n",
     "            logits = logits / self.out_tau\n",
     "\n",
+    "        # Convert logits to probabilities\n",
     "        probs = F.softmax(logits, dim=-1)\n",
-    "        return probs  # (B, T, 2)\n",
+    "        return probs  # Shape: (B, T, 2)\n",
     "\n",
     "    def log_probability(self, spins):\n",
     "        \"\"\"\n",
-    "        spins: (B, N) in {0,1}\n",
-    "        Compute log P(s1...sN); an <sos> token is prepended internally\n",
+    "        Compute the log-probability of given spin configurations.\n",
+    "\n",
+    "        Args:\n",
+    "            spins (torch.LongTensor): Tensor of shape (B, N), each element ∈ {0,1}\n",
+    "\n",
+    "        Returns:\n",
+    "            logp (torch.FloatTensor): Log-probabilities for each configuration (B,)\n",
     "        \"\"\"\n",
     "        B = spins.size(0)\n",
     "        device = spins.device\n",
+    "\n",
+    "        # Prepend <sos> token (start-of-sequence)\n",
     "        sos = torch.full((B, 1), 2, dtype=torch.long, device=device)\n",
-    "        tokens = torch.cat([sos, spins], dim=1)  # (B, N+1)\n",
+    "        tokens = torch.cat([sos, spins], dim=1)  # Shape: (B, N+1)\n",
     "\n",
-    "        probs = self.forward(tokens)[:, :-1, :] \n",
+    "        # Forward pass through the model (excluding last prediction)\n",
+    "        probs = self.forward(tokens)[:, :-1, :]  # (B, N, 2)\n",
+    "\n",
+    "        # Create one-hot encoding for actual spins and select their probabilities\n",
     "        one_hot = F.one_hot(spins, num_classes=2).float()\n",
-    "        sel = (probs * one_hot).sum(dim=-1)  # (B, N)\n",
+    "        sel = (probs * one_hot).sum(dim=-1)  # Select the probability for each true spin value\n",
+    "\n",
+    "        # Compute total log-probability\n",
     "        logp = torch.log(sel.clamp_min(1e-12)).sum(dim=-1)  # (B,)\n",
     "        return logp\n",
     "\n",
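To make the `<sos>`-shifted chain rule in `log_probability` concrete, this standalone sketch (toy batch size, untrained weights) checks that the method matches an explicit sum of per-site conditional log-probabilities:

```python
import torch

torch.manual_seed(0)
cfg = SimpleConfig()                              # random 20-node instance (first cell)
model = TransformerWavefunction(cfg).eval()

spins = torch.randint(0, 2, (5, cfg.num_nodes))   # (B, N) toy batch
logp = model.log_probability(spins)               # (B,)

# Manual chain rule: prepend <sos>=2, read off p(s_i | s_<i) at each position i.
sos = torch.full((5, 1), 2, dtype=torch.long)
probs = model(torch.cat([sos, spins], dim=1))[:, :-1, :]   # (B, N, 2)
manual = probs.gather(-1, spins.unsqueeze(-1)).squeeze(-1)
manual = manual.clamp_min(1e-12).log().sum(dim=-1)
print(torch.allclose(logp, manual))               # True
```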
     "    @torch.no_grad()\n",
     "    def sample(self, numsamples, tau_sample=1.0, epsilon=0.0):\n",
     "        \"\"\"\n",
-    "        Autoregressive sampling, with support for a sampling temperature and ε-exploration\n",
-    "        Returns: samples (B, N), logp (B)\n",
+    "        Autoregressive sampling of spin configurations.\n",
+    "\n",
+    "        Args:\n",
+    "            numsamples (int): Number of samples to generate.\n",
+    "            tau_sample (float): Sampling temperature; <1.0 makes samples sharper, >1.0 softer.\n",
+    "            epsilon (float): Epsilon-exploration rate; mixes in uniform noise to avoid collapse.\n",
+    "\n",
+    "        Returns:\n",
+    "            spins (torch.LongTensor): Sampled spin configurations of shape (B, N)\n",
+    "            logp (torch.FloatTensor): Log-probabilities of each sample (B,)\n",
     "        \"\"\"\n",
     "        device = next(self.parameters()).device\n",
     "        B = numsamples\n",
     "\n",
-    "        tokens = torch.full((B, 1), 2, dtype=torch.long, device=device)  # <sos>\n",
+    "        # Start each sequence with the <sos> token\n",
+    "        tokens = torch.full((B, 1), 2, dtype=torch.long, device=device)\n",
     "        logp_parts = []\n",
     "\n",
+    "        # Sequentially generate spins one by one\n",
     "        for n in range(self.N):\n",
-    "            probs = self.forward(tokens, use_out_tau=True)[:, -1, :]  # (B,2)\n",
-    "            # Sampling temperature: equivalent to scaling the logits; applied to probabilities it acts as a power law\n",
+    "            # Compute the probability distribution for the next spin\n",
+    "            probs = self.forward(tokens, use_out_tau=True)[:, -1, :]  # (B, 2)\n",
+    "\n",
+    "            # Apply temperature scaling to sampling probabilities\n",
     "            if tau_sample != 1.0:\n",
     "                probs = F.softmax((probs.clamp_min(1e-20)).log() / tau_sample, dim=-1)\n",
     "\n",
-    "            # ε-exploration: mix in a uniform distribution to avoid premature determinism\n",
+    "            # Epsilon-exploration: mix in a uniform distribution to encourage diversity\n",
     "            if epsilon > 0.0:\n",
     "                probs = (1.0 - epsilon) * probs + epsilon * 0.5\n",
     "\n",
+    "            # Sample next spin from categorical distribution\n",
     "            dist = torch.distributions.Categorical(probs)\n",
-    "            s = dist.sample()  # (B,)\n",
+    "            s = dist.sample()  # (B,)\n",
     "            tokens = torch.cat([tokens, s[:, None]], dim=1)\n",
+    "\n",
+    "            # Record log-probabilities\n",
     "            logp_parts.append(dist.log_prob(s))\n",
     "\n",
-    "        spins = tokens[:, 1:]  # strip <sos>\n",
+    "        # Remove the <sos> token from the final sequence\n",
+    "        spins = tokens[:, 1:]\n",
     "        logp = torch.stack(logp_parts, dim=1).sum(dim=1)\n",
     "        return spins, logp\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "878c7cb7",
    "metadata": {},
    "outputs": [],
@@ -190,60 +320,124 @@
    "import time\n",
    "import random\n",
    "\n",
    "def local_energy_SK(Jz, samples):\n",
-    "    # samples: (B, N) in {0,1} -> {-1,+1}\n",
-    "    spins = (2 * samples.cpu().numpy()) - 1\n",
-    "    E = -0.5 * np.einsum(\"bi,ij,bj->b\", spins, Jz, spins)  # -1/2 Σ_ij J_ij s_i s_j\n",
+    "    \"\"\"\n",
+    "    Compute the classical SK (Sherrington-Kirkpatrick) Hamiltonian energy for batches of spin samples.\n",
+    "\n",
+    "    Args:\n",
+    "        Jz (np.ndarray): (N, N) symmetric coupling matrix.\n",
+    "        samples (torch.LongTensor or torch.Tensor): shape (B, N), values in {0,1}.\n",
+    "\n",
+    "    Returns:\n",
+    "        torch.Tensor: shape (B,), energy values E = -1/2 sum_ij J_ij s_i s_j where s in {-1,+1}.\n",
+    "    \"\"\"\n",
+    "    # Convert {0,1} -> {-1,+1} in numpy for fast einsum calculation\n",
+    "    spins = (2 * samples.cpu().numpy()) - 1  # shape (B, N), dtype=int\n",
+    "    # E = -1/2 * sum_{i,j} J_ij s_i s_j computed per batch element\n",
+    "    E = -0.5 * np.einsum(\"bi,ij,bj->b\", spins, Jz, spins)\n",
+    "    # Return as torch tensor on the original device, dtype float64 for numerical precision\n",
     "    return torch.tensor(E, dtype=torch.float64, device=samples.device)\n",
     "\n",
+    "\n",
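To pin down the sign and the 1/2 factor, here is a hand-checkable two-spin case (toy couplings of my choosing): with J_01 = J_10 = 1, aligned spins give E = -1/2 * (1 + 1) = -1, and anti-aligned spins give E = +1.

```python
import numpy as np
import torch

Jz = np.array([[0.0, 1.0],
               [1.0, 0.0]])
samples = torch.tensor([[1, 1],    # spins (+1, +1): E = -0.5 * (1 + 1) = -1
                        [0, 1]])   # spins (-1, +1): E = -0.5 * (-1 - 1) = +1
print(local_energy_SK(Jz, samples))   # tensor([-1.,  1.], dtype=torch.float64)
```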
    "def train_wavefunction(config, model_class, device=\"cpu\"):\n",
+    "    \"\"\"\n",
+    "    Train an autoregressive Transformer wavefunction model using a variational free-energy objective\n",
+    "    with a temperature schedule (annealing). The function runs until the annealing schedule completes,\n",
+    "    then performs a large-sample inference to estimate final energies.\n",
     "\n",
+    "    Args:\n",
+    "        config: configuration object (expects attributes used elsewhere: seed, numsamples, lr, Jz, etc).\n",
+    "        model_class: a callable (class) returning an nn.Module instance (e.g., TransformerWavefunction).\n",
+    "        device (str or torch.device): device for model and tensors.\n",
+    "\n",
+    "    Returns:\n",
+    "        If training completes normally, returns (mean_energy, min_energy, energies_tensor) at the final inference step.\n",
+    "        If training does not reach the final inference step, returns the trained model object instead (last line).\n",
+    "    \"\"\"\n",
+    "    # -------------------------\n",
+    "    # Reproducibility: set RNG seeds\n",
+    "    # -------------------------\n",
     "    seed = config.seed\n",
     "    random.seed(seed)\n",
     "    np.random.seed(seed)\n",
-    "    torch.manual_seed(seed) \n",
+    "    torch.manual_seed(seed)\n",
     "\n",
+    "    # -------------------------\n",
+    "    # Model, optimizer, scheduler\n",
+    "    # -------------------------\n",
     "    model = model_class(config).to(device)\n",
     "    opt = torch.optim.Adam(model.parameters(), lr=config.lr, weight_decay=0.0)\n",
-    "    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=(\n",
-    "        config.num_warmup_steps + config.num_annealing_steps * config.num_equilibrium_steps\n",
-    "    ), eta_min=config.lr * 0.1)\n",
     "\n",
-    "    # Initialize Temperature and Steps\n",
+    "    # CosineAnnealingLR used to slowly change LR; T_max set to total number of steps\n",
+    "    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(\n",
+    "        opt,\n",
+    "        T_max=(\n",
+    "            config.num_warmup_steps + config.num_annealing_steps * config.num_equilibrium_steps\n",
+    "        ),\n",
+    "        eta_min=config.lr * 0.1\n",
+    "    )\n",
+    "\n",
+    "    # -------------------------\n",
+    "    # Initialize temperature and field and compute total steps\n",
+    "    # -------------------------\n",
     "    T, Bx = config.T0, config.Bx0\n",
     "    num_steps = config.num_warmup_steps + (config.num_annealing_steps * config.num_equilibrium_steps)\n",
+    "\n",
     "    start = time.time()\n",
     "    for it in range(num_steps + 1):\n",
     "\n",
-    "        # Linear Temperature Schedule\n",
-    "        if it >= config.num_warmup_steps and it % config.num_equilibrium_steps == 0 and it <= config.num_annealing_steps*config.num_equilibrium_steps + config.num_warmup_steps:\n",
+    "        # -------------------------\n",
+    "        # Annealing schedule (linear piecewise): after warmup, every `num_equilibrium_steps` reduce T\n",
+    "        # -------------------------\n",
+    "        if (it >= config.num_warmup_steps\n",
+    "                and it % config.num_equilibrium_steps == 0\n",
+    "                and it <= config.num_annealing_steps * config.num_equilibrium_steps + config.num_warmup_steps):\n",
     "            anneal_k = (it - config.num_warmup_steps) / config.num_equilibrium_steps\n",
+    "            # Linearly decrease from T0 -> 0 across `num_annealing_steps`\n",
     "            T = config.T0 * (1 - anneal_k / config.num_annealing_steps)\n",
+    "            # Bx is similarly annealed (here initial Bx0 may be 0)\n",
     "            Bx = config.Bx0 * (1 - anneal_k / config.num_annealing_steps)\n",
     "            print(f\"\\nAnnealing Step: {anneal_k}/{config.num_annealing_steps}\")\n",
     "\n",
-    "        # Sample SIGMA and Log Probabilities\n",
+    "        # -------------------------\n",
+    "        # Sampling: draw configurations from current model (no grads)\n",
+    "        # -------------------------\n",
     "        with torch.no_grad():\n",
+    "            # model.sample returns spins (B,N) in {0,1} and log probability of sequence\n",
     "            spins, logp_seq = model.sample(config.numsamples)\n",
-    "        \n",
-    "        # Calculate Local Energy\n",
+    "\n",
+    "        # -------------------------\n",
+    "        # Compute local energy for each sampled configuration\n",
+    "        # local_energy_SK returns a torch tensor (B,) in dtype float64\n",
+    "        # -------------------------\n",
     "        E_local = local_energy_SK(config.Jz, spins).to(torch.float64)  # (B,)\n",
     "        logp_seq = logp_seq.to(torch.float64)  # (B,)\n",
     "\n",
-    "        # Print Statistics: Energy, Free Energy, Variance\n",
-    "        free_local_energy = (E_local + T*logp_seq).detach()\n",
-    "        if it%config.num_equilibrium_steps==0:\n",
-    "            print(f'mean(E): {E_local.mean().item()}, mean(F): {free_local_energy.mean().item()}, var(E): {E_local.var().item()}, var(F): {free_local_energy.var().item()}, #samples {config.numsamples}, #Training step {it}')\n",
+    "        # -------------------------\n",
+    "        # Monitoring: \"free local energy\" = E + T * log P\n",
+    "        # Per sample, E + T*log P estimates the variational free energy <E> - T*S,\n",
+    "        # since the entropy of the sampling distribution is S = -<log P>.\n",
+    "        # -------------------------\n",
+    "        free_local_energy = (E_local + T * logp_seq).detach()\n",
+    "        if it % config.num_equilibrium_steps == 0:\n",
+    "            # Print running statistics: mean energy, mean free energy, variances, and temperature / field\n",
+    "            print(\n",
+    "                f'mean(E): {E_local.mean().item()}, mean(F): {free_local_energy.mean().item()}, '\n",
+    "                f'var(E): {E_local.var().item()}, var(F): {free_local_energy.var().item()}, '\n",
+    "                f'#samples {config.numsamples}, #Training step {it}'\n",
+    "            )\n",
     "            print(\"Temperature: \", T)\n",
     "            print(\"Magnetic field: \", Bx)\n",
     "\n",
-    "        # Inference at the end of Annealing\n",
-    "        if it == config.num_annealing_steps*config.num_equilibrium_steps + config.num_warmup_steps:\n",
+    "        # -------------------------\n",
+    "        # Final inference once annealing completes:\n",
+    "        # take a large number of samples to estimate mean / variance / min energies.\n",
+    "        # -------------------------\n",
+    "        if it == config.num_annealing_steps * config.num_equilibrium_steps + config.num_warmup_steps:\n",
     "\n",
-    "            numsamples_inference = 10**4  # Total number of samples at the end of annealing \n",
-    "            Nsteps = 20  # number of steps to sample numsamples_inference\n",
-    "            numsamples_perstep = numsamples_inference//Nsteps\n",
+    "            numsamples_inference = 10**4  # total number of samples to draw for final evaluation\n",
+    "            Nsteps = 20  # split into 20 batches to avoid memory blowup\n",
+    "            numsamples_perstep = numsamples_inference // Nsteps\n",
     "            energies = torch.zeros((numsamples_inference))\n",
-    "            # solutions = torch.zeros((numsamples_inference, config.N))\n",
+    "            # solutions = torch.zeros((numsamples_inference, config.N))  # optionally save spin configurations\n",
     "\n",
     "            print(\"\\nSaving energy and variance before the end of annealing\")\n",
     "            for i in range(Nsteps):\n",
@@ -256,23 +450,45 @@
     "            print(\"varE = \", energies.var().item())\n",
     "            print(\"minE = \", energies.min().item())\n",
     "\n",
+    "            # Return final aggregated statistics and the raw energies tensor\n",
     "            return energies.mean().item(), energies.min().item(), energies\n",
     "\n",
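For reference, the piecewise-linear schedule implemented earlier in this loop behaves as follows under the `SimpleConfig` defaults (T0 = 1.0, 100 warmup steps, 20 annealing steps of 10 equilibrium steps each); a self-contained replica:

```python
T0, warmup, n_anneal, n_eq = 1.0, 100, 20, 10
T, schedule = T0, []
for it in range(warmup + n_anneal * n_eq + 1):
    if it >= warmup and it % n_eq == 0 and it <= n_anneal * n_eq + warmup:
        k = (it - warmup) / n_eq
        T = T0 * (1 - k / n_anneal)
    schedule.append(T)
print(schedule[0], schedule[100], schedule[110], schedule[-1])   # 1.0 1.0 0.95 0.0
```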
-    "        logp_for_grad = model.log_probability(spins).to(torch.float64)\n",
-    "        Floc = (E_local + T*logp_for_grad).detach()\n",
+    "        # -------------------------\n",
+    "        # Compute log-probabilities for gradient update and the variational objective\n",
+    "        # The gradient estimator used is the classic REINFORCE-like / score-function estimator\n",
+    "        # with a covariance baseline: loss = Cov(F_loc, log p) = E[F log p] - E[F] E[log p].\n",
+    "        #\n",
+    "        # Explanation:\n",
+    "        # We want to minimize the expected free energy E_p[F] where F = E_local + T * log p.\n",
+    "        # Taking the gradient wrt model parameters θ gives E_p[(F - baseline) * ∇θ log p].\n",
+    "        # Using the sample-based covariance form yields an unbiased estimator for that gradient up\n",
+    "        # to sampling error; it also centers both terms to reduce variance.\n",
+    "        # -------------------------\n",
+    "        logp_for_grad = model.log_probability(spins).to(torch.float64)  # (B,)\n",
+    "        Floc = (E_local + T * logp_for_grad).detach()  # stop gradient through Floc\n",
+    "\n",
+    "        # Covariance-based loss whose gradient approximates E[(F - E[F]) ∇ log p]\n",
     "        loss = torch.mean(Floc * logp_for_grad) - torch.mean(logp_for_grad) * torch.mean(Floc)\n",
     "\n",
+    "        # -------------------------\n",
+    "        # Backprop and optimizer step\n",
+    "        # -------------------------\n",
     "        opt.zero_grad()\n",
     "        loss.backward()\n",
+    "        # Optionally clip gradients: uncomment next line if exploding grads occur\n",
     "        # total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n",
-    "        opt.step(); scheduler.step()\n",
+    "        opt.step()\n",
+    "        scheduler.step()\n",
     "\n",
-    "        # ---- Logging ----\n",
+    "        # -------------------------\n",
+    "        # Additional logging for debugging/tracking convergence\n",
+    "        # -------------------------\n",
     "        if it % config.num_equilibrium_steps == 0:\n",
     "            print(f\"Grad Log Prob: {logp_for_grad.mean().item()}\")\n",
-    "            print(f\"T*LogProb: {(T*logp_for_grad).mean().item()}\")\n",
-    "            print(f\"Elapsed time = {time.time()-start:.2f} seconds\\n\")\n",
+    "            print(f\"T*LogProb: {(T * logp_for_grad).mean().item()}\")\n",
+    "            print(f\"Elapsed time = {time.time() - start:.2f} seconds\\n\")\n",
     "\n",
+    "    # If loop finishes without hitting the inference return, return the trained model\n",
     "    return model\n"
    ]
   },
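Because `Floc` is detached, the covariance-form loss is algebraically identical to the centered REINFORCE loss `mean((F - mean(F)) * log p)`, so both produce the same gradient. A standalone check with toy stand-ins for `F` and `log p`:

```python
import torch

torch.manual_seed(0)
theta = torch.randn(8, requires_grad=True)
logp = torch.log_softmax(theta, dim=0)   # stand-in for per-sample log-probabilities
F = torch.randn(8)                        # stand-in for the detached F_loc values

loss_cov = torch.mean(F * logp) - torch.mean(logp) * torch.mean(F)
loss_centered = torch.mean((F - F.mean()) * logp)
print(torch.allclose(loss_cov, loss_centered))   # True, so the gradients agree too
```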
+ """ + + # Special initialization strategy for Cheeger cut problems + if problem_type in ["r_cheegercut", "n_cheegercut"]: + # Initialize all samples as zero vectors + samples = torch.zeros(config['total_mcmc_num'], data.num_nodes) + + # Select nodes with the largest degrees + index = data.sorted_degree_nodes[-config['total_mcmc_num']:] + + # Activate one node per sample according to degree ranking + for i in range(config['total_mcmc_num']): + samples[i][index[i]] = 1 + + # Repeat samples to match the required number of MCMC chains + samples = samples.repeat(config['repeat_times'], 1) + + # Transpose to shape (num_nodes, num_samples) + return samples.t() + + # Default random initialization using Bernoulli distribution + m = Bernoulli(probs) + + # Sample binary states for all MCMC chains + samples = m.sample([config['total_mcmc_num'] * config['repeat_times']]) + + # Detach from computation graph and move to target device + samples = samples.detach().to(device) + + # Transpose to shape (num_nodes, num_samples) + return samples.t() + +def metro_sampling(probs, start_status, max_transfer_time, + device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')): + """ + Perform Metropolis-style sampling for multiple MCMC chains. + + Parameters + ---------- + probs : torch.Tensor + Probability vector for each node (Bernoulli parameters), shape (num_nodes,). + start_status : torch.Tensor + Initial states of all chains, shape (num_nodes, num_chains). + max_transfer_time : int + Maximum number of accepted state transitions per chain. + device : torch.device + Device on which computation is performed (CPU or GPU). + + Returns + ------- + torch.Tensor + Final sampled states, shape (num_nodes, num_chains), with float values {0, 1}. + """ + + # Number of nodes in the graph + num_node = len(probs) + + # Number of parallel MCMC chains + num_chain = start_status.shape[1] + + # Column indices corresponding to different chains + index_col = torch.tensor(list(range(num_chain))).to(device) + + # Move probability vector and initial samples to the target device + probs = probs.detach().to(device) + samples = start_status.bool().to(device) + + # Count the number of accepted transitions + count = 0 + + # Perform Metropolis updates (upper bounded by max_transfer_time * 5 trials) + for t in range(max_transfer_time * 5): + # Stop if total accepted moves reach the desired amount + if count >= num_chain * max_transfer_time: + break + + # Randomly select one node to update for each chain + index_row = torch.randint( + low=0, high=num_node, size=[num_chain], device=device + ) + + # Extract the Bernoulli probabilities for selected nodes + chosen_probs_base = probs[index_row] + + # Current states of the selected nodes in each chain + chosen_value = samples[index_row, index_col] + + # Probability of the current state under Bernoulli distribution + chosen_probs = torch.where( + chosen_value, chosen_probs_base, 1 - chosen_probs_base + ) + + # Metropolis acceptance ratio + accept_rate = (1 - chosen_probs) / chosen_probs + + # Draw uniform random numbers for acceptance decision + r = torch.rand(num_chain, device=device) + is_accept = (r < accept_rate) + + # Flip the selected bits if the proposal is accepted + samples[index_row, index_col] = torch.where( + is_accept, ~chosen_value, chosen_value + ) + + # Update accepted move counter + count += is_accept.sum() + + # Return samples as float tensor + return samples.float().to(device) + +def mcpg_sampling_ising( + data, + start_result, probs, + num_ls, change_times, 
+def mcpg_sampling_ising(
+        data,
+        start_result, probs,
+        num_ls, change_times, total_mcmc_num,
+        device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+):
+    """
+    Monte Carlo Policy Gradient (MCPG) sampling procedure for Ising models.
+
+    This function performs:
+        1) Metropolis sampling for initialization,
+        2) Local search refinement on graph cuts,
+        3) Energy evaluation under the Ising Hamiltonian,
+        4) Advantage computation for policy gradient training.
+
+    Parameters
+    ----------
+    data : object
+        Graph data object containing:
+            - edge_index, edge_attr, num_edges
+            - sorted_degree_nodes, sorted_degree_edges
+            - auxiliary edge/node indexing tensors
+    start_result : torch.Tensor
+        Initial spin configurations, shape (num_nodes, num_chains).
+    probs : torch.Tensor
+        Bernoulli probabilities for each node, shape (num_nodes,).
+    num_ls : int
+        Number of local search iterations.
+    change_times : int
+        Maximum accepted transitions for Metropolis sampling.
+    total_mcmc_num : int
+        Number of parallel MCMC chains per repeat group.
+    device : torch.device
+        Device used for computation.
+
+    Returns
+    -------
+    temp_min : torch.Tensor
+        Minimum energy found for each MCMC group, shape (total_mcmc_num,).
+    temp_min_info : torch.Tensor
+        Spin configurations corresponding to minimum energies,
+        shape (num_nodes, total_mcmc_num).
+    start_samples : torch.Tensor
+        Initial samples after Metropolis sampling, shape (num_nodes, num_chains).
+    adv : torch.Tensor
+        Advantage values for policy gradient, shape (num_chains,).
+    """
+
+    # Move probabilities to CPU for Metropolis sampling stability
+    probs = probs.to(torch.device("cpu"))
+
+    # Graph structure information
+    num_edges = data.num_edges
+    edges = data.edge_index
+    nlr_graph = edges[0]   # source nodes of edges
+    nlc_graph = edges[1]   # target nodes of edges
+    edge_weight = data.edge_attr
+    edge_weight_sum = data.edge_weight_sum
+
+    # Clone initial configurations
+    graph_probs = start_result.clone()
+
+    # -----------------------------
+    # Metropolis sampling phase
+    # -----------------------------
+    graph_probs = metro_sampling(
+        probs, graph_probs, change_times, device
+    )
+    start = graph_probs.clone()
+
+    # Apply a global spin flip based on highest-degree node
+    temp = graph_probs[data.sorted_degree_nodes[0]].clone()
+    graph_probs += temp
+    graph_probs = graph_probs % 2
+
+    # Nudge states off exactly {0, 1}: x -> 2x - 0.5 maps {0, 1} to {-0.5, 1.5}
+    graph_probs = (graph_probs - 0.5) * 2 + 0.5
+
+    # -----------------------------
+    # Local search phase
+    # -----------------------------
+    temp = torch.zeros(4, graph_probs.size(dim=1)).to(device)
+    expected_cut = torch.zeros(graph_probs.size(dim=1))
+    cnt = 0
+
+    while True:
+        cnt += 1
+
+        # Iterate over edges ordered by degree
+        for i in range(num_edges):
+            index = data.sorted_degree_edges[i]
+
+            # Edge endpoints
+            node_r = nlr_graph[index]
+            node_c = nlc_graph[index]
+
+            # Incident edge structures
+            edges_r = data.n0_edges[index]
+            edges_c = data.n1_edges[index]
+
+            # Precomputed additive constants
+            add_0 = data.add[0][index]
+            add_1 = data.add[1][index]
+            add_2 = data.add[2][index]
+
+            # Compute local contributions for both endpoints
+            temp_r_v = torch.mm(edges_r, graph_probs[data.n0[index]])
+            temp_c_v = torch.mm(edges_c, graph_probs[data.n1[index]])
+
+            # Score the four joint assignments (00, 01, 10, 11) of the endpoint pair;
+            # the untouched row temp[3] = 0 serves as the reference score for (1, 1)
+            temp[1] = temp_r_v + torch.rand(
+                graph_probs.size(dim=1), device=device
+            ) * 0.1 + add_0
+            temp[2] = temp_c_v + torch.rand(
+                graph_probs.size(dim=1), device=device
+            ) * 0.1 + add_1
+            temp[0] = (
+                temp[1] + temp[2]
+                + torch.rand(
+                    graph_probs.size(dim=1), device=device
+                ) * 0.1
+                - add_2
+            )
+
+            # Select the best assignment
+            max_index = torch.argmax(temp, dim=0)
+            graph_probs[node_r] = torch.floor(max_index / 2)
+            graph_probs[node_c] = max_index % 2
+
+        # Stop local search after num_ls iterations
+        if cnt >= num_ls:
+            break
+
+    # -----------------------------
+    # Energy computation
+    # -----------------------------
+
+    # Convert spins to Ising convention {-1, +1}
+    spins = 2 * graph_probs - 1   # [num_nodes, chains]
+
+    # Pairwise Ising objective as implemented here: sum_{(i,j)} J_ij * s_i * s_j.
+    # Note there is no leading minus sign; the argmin below therefore selects the
+    # configurations with the lowest value of this sum.
+    energy_vec = (
+        edge_weight * (spins[nlr_graph] * spins[nlc_graph])
+    ).sum(dim=0)   # [chains]
+
+    # Advantage for policy gradient (baseline = mean energy)
+    adv = (energy_vec - energy_vec.mean()).to(device)
+
+    # -----------------------------
+    # Select best chain per group
+    # -----------------------------
+    energy_reshape = energy_vec.view(-1, total_mcmc_num)
+    idx = torch.argmin(energy_reshape, dim=0)
+
+    # Convert local indices to global chain indices
+    for i0 in range(total_mcmc_num):
+        idx[i0] = i0 + idx[i0] * total_mcmc_num
+
+    temp_min = energy_vec[idx]            # [total_mcmc_num]
+    temp_min_info = graph_probs[:, idx]   # [num_nodes, total_mcmc_num]
+    start_samples = start                 # [num_nodes, chains]
+
+    # Return:
+    #   1) best energies,
+    #   2) configurations achieving best energies,
+    #   3) starting samples,
+    #   4) advantage for policy gradient update
+    return temp_min, temp_min_info, start_samples, adv
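The index conversion above relies on the flat chain axis being laid out repeat-major (chain = repeat_index * total_mcmc_num + group_index, which is what `repeat(1, repeat_times)` produces in the solver). A toy check that the arithmetic recovers the per-group minima:

```python
import torch

total_mcmc_num, repeat_times = 4, 3
energy_vec = torch.randn(total_mcmc_num * repeat_times)   # flat, repeat-major layout
energy_reshape = energy_vec.view(-1, total_mcmc_num)      # (repeat_times, total_mcmc_num)
idx = torch.argmin(energy_reshape, dim=0)                 # best repeat per group
flat = torch.arange(total_mcmc_num) + idx * total_mcmc_num
print(torch.equal(energy_vec[flat], energy_reshape.min(dim=0).values))   # True
```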
+ """ + + # Constant input (dummy input) + x = torch.ones(1).to(device) + + # Linear transformation followed by sigmoid + x = self.lin(x) + x = self.sigmoid(x) + + # Slightly restrict probability range to avoid numerical extremes + x = (x - 0.5) * 0.6 + 0.5 + probs = x.squeeze() + + retdict = {} + + # Entropy-style regularization term + # Encourages exploration and avoids probability collapse + reg = probs * torch.log(probs) + (1 - probs) * torch.log(1 - probs) + reg = torch.mean(reg) + + # -------------------------------------------------- + # Inference mode: no samples provided + # -------------------------------------------------- + if start_samples is None: + retdict["output"] = [probs.squeeze(-1), "hist"] + retdict["reg"] = [reg, "sequence"] + retdict["loss"] = [alpha * reg, "sequence"] + return retdict + + # -------------------------------------------------- + # Training mode: policy gradient loss + # -------------------------------------------------- + + # Detach reward/advantage from computation graph + res_samples = value.t().detach() + + # Probability of observing start_samples under Bernoulli policy + start_samples_idx = ( + start_samples * probs + (1 - start_samples) * (1 - probs) + ) + + # Log-likelihood of each sampled configuration + log_start_samples_idx = torch.log(start_samples_idx) + log_start_samples = log_start_samples_idx.sum(dim=1) + + # REINFORCE-style policy gradient loss + loss_ls = torch.mean(log_start_samples * res_samples) + + # Total loss = policy loss + regularization + loss = loss_ls + alpha * reg + + retdict["output"] = [probs.squeeze(-1), "hist"] + retdict["reg"] = [reg, "sequence"] + retdict["loss"] = [loss, "sequence"] + return retdict + + def __repr__(self): + """Return class name for clean printing.""" + return self.__class__.__name__ + +''' +Algorithm +''' +def mcpg_solver(nvar, config, data, verbose=False): + """ + Monte Carlo Policy Gradient (MCPG) solver for combinatorial optimization + problems formulated as Ising models. + + This solver alternates between: + 1) Policy network optimization via REINFORCE, + 2) MCMC-based sampling (Metropolis + local search), + 3) Selecting elite samples to guide the next iteration. + + Parameters + ---------- + nvar : int + Number of variables / nodes in the problem. + config : dict + Configuration dictionary containing training and sampling hyperparameters. + data : object + Graph or problem-specific data structure. + verbose : bool + Whether to print intermediate optimization results. + + Returns + ------- + float + Best (minimum) objective value found. + torch.Tensor + Best solution configuration. + torch.Tensor + Energy values of the best solutions per chain. + torch.Tensor + Solution configurations corresponding to now_max_res. 
+ """ + + # Select computation device + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + # Sampling function (Ising MCPG sampler) + sampler = mcpg_sampling_ising + + # Number of accepted Metropolis transitions per chain + change_times = int(nvar / 10) + + # Initialize policy network + net = simple(nvar) + net.to(device).reset_parameters() + + # Optimizer + optimizer = torch.optim.Adam(net.parameters(), lr=config['lr_init']) + + # Initial samples are not available at the beginning + start_samples = None + + # ----------------------------- + # Main training loop + # ----------------------------- + for epoch in range(config['max_epoch_num']): + + # Periodically reset network parameters + if epoch % config['reset_epoch_num'] == 0: + net.to(device).reset_parameters() + regular = config['regular_init'] + + net.train() + + # Inference-only mode for the first epoch + if epoch <= 0: + retdict = net(regular, None, None) + else: + retdict = net(regular, start_samples, value) + + # Backpropagation + retdict["loss"][0].backward() + + # Gradient clipping for stability + torch.nn.utils.clip_grad_norm_(net.parameters(), 1) + + optimizer.step() + + # -------------------------------------------------- + # Initial sampling (epoch 0) + # -------------------------------------------------- + if epoch == 0: + # Start from uniform Bernoulli probabilities + probs = (torch.zeros(nvar) + 0.5).to(device) + + # Initialize MCMC samples + tensor_probs = sample_initializer( + config["problem_type"], probs, config, data=data + ) + + # Run sampler without Metropolis transitions + temp_max, temp_max_info, temp_start_samples, value = sampler( + data, tensor_probs, probs, + config['num_ls'], 0, config['total_mcmc_num'] + ) + + # Store current best solutions + now_max_res = temp_max + now_max_info = temp_max_info + + # Repeat elite samples for next iteration + tensor_probs = temp_max_info.clone() + tensor_probs = tensor_probs.repeat(1, config['repeat_times']) + + # Initialize start samples for policy gradient + start_samples = temp_start_samples.t().to(device) + + # -------------------------------------------------- + # Periodic resampling and elite selection + # -------------------------------------------------- + if epoch % config['sample_epoch_num'] == 0 and epoch > 0: + # Get probabilities predicted by policy network + probs = retdict["output"][0].detach() + + # Run MCPG sampler + temp_max, temp_max_info, start_samples_temp, value = sampler( + data, tensor_probs, probs, + config['num_ls'], change_times, config['total_mcmc_num'] + ) + + # Update best solutions per chain + for i0 in range(config['total_mcmc_num']): + if temp_max[i0] < now_max_res[i0]: + now_max_res[i0] = temp_max[i0] + now_max_info[:, i0] = temp_max_info[:, i0] + + # Identify best and worst chains + now_max = min(now_max_res).item() + now_max_index = torch.argmin(now_max_res) + + now_min = max(now_max_res).item() + now_min_index = torch.argmax(now_max_res) + + # Replace the worst chain with the best one + now_max_res[now_min_index] = now_max + now_max_info[:, now_min_index] = now_max_info[:, now_max_index] + temp_max_info[:, now_min_index] = now_max_info[:, now_max_index] + + # Prepare samples for the next iteration + tensor_probs = temp_max_info.clone() + tensor_probs = tensor_probs.repeat(1, config['repeat_times']) + + start_samples = start_samples_temp.t() + + # Optional logging + if verbose: + if config["problem_type"] == "maxsat" and len(data.pdata) == 7: + res = max(now_max_res).item() + if res > data.pdata[5] * data.pdata[6]: 
+                    res -= data.pdata[5] * data.pdata[6]
+                    print("o {:.3f}".format(res))
+                elif "obj_type" in config and config["obj_type"] == "neg":
+                    print("o {:.3f}".format((-min(now_max_res).item())))
+                else:
+                    print("o {:f}".format((min(now_max_res).item())))
+
+        # Explicitly free dictionary
+        del retdict
+
+    # -----------------------------
+    # Final result selection
+    # -----------------------------
+    total_max = now_max_res
+    best_sort = torch.argsort(now_max_res, descending=False)
+    total_best_info = torch.squeeze(now_max_info[:, best_sort[0]])
+
+    return min(total_max).item(), total_best_info, now_max_res, now_max_info
+
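The solver indexes a handful of configuration keys (`lr_init`, `max_epoch_num`, `reset_epoch_num`, `sample_epoch_num`, `regular_init`, `total_mcmc_num`, `repeat_times`, `num_ls`, `problem_type`, and optionally `obj_type`). Here is a hypothetical config with placeholder values plus an invocation sketch; none of these numbers are tuned recommendations:

```python
# Hypothetical config for the Ising/Max-Cut path of this script; every key
# below is one the solver actually reads, but the values are untuned placeholders.
config = {
    "problem_type": "maxcut",
    "lr_init": 0.1,           # Adam learning rate for the policy network
    "max_epoch_num": 100,     # total training epochs
    "reset_epoch_num": 50,    # reset the policy network every N epochs
    "sample_epoch_num": 8,    # resample via MCPG every N epochs
    "regular_init": 0.01,     # entropy-regularization coefficient (alpha)
    "total_mcmc_num": 40,     # chains per repeat group
    "repeat_times": 10,       # repeat groups (total chains = 40 * 10)
    "num_ls": 5,              # local-search sweeps per sampling call
}
# data, nvar = maxcut_dataloader("example.maxcut")   # hypothetical instance file
# best, best_info, per_chain_best, best_infos = mcpg_solver(nvar, config, data, verbose=True)
```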
+'''
+Dataloader
+'''
+def maxcut_dataloader(path,
+                      device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
+    """
+    Load a Max-Cut instance from file and construct a graph data object
+    with additional preprocessing for MCPG / local search.
+
+    The input file format is expected to be:
+        line 1: <num_nodes> <num_edges>
+        line i: <node_u> <node_v> <edge_weight>
+
+    Node indices in the file are assumed to be 1-based.
+
+    Parameters
+    ----------
+    path : str
+        Path to the Max-Cut instance file.
+    device : torch.device
+        Device on which tensors are allocated.
+
+    Returns
+    -------
+    data_maxcut : Data
+        PyTorch Geometric Data object with auxiliary attributes used by MCPG.
+    num_nodes : int
+        Number of nodes in the graph.
+    """
+
+    # -----------------------------
+    # Read graph from file
+    # -----------------------------
+    with open(path) as f:
+        fline = f.readline().split()
+        num_nodes, num_edges = int(fline[0]), int(fline[1])
+
+        # Edge index tensor (2 x num_edges)
+        edge_index = torch.LongTensor(2, num_edges)
+
+        # Edge weight tensor (num_edges x 1)
+        edge_attr = torch.Tensor(num_edges, 1)
+
+        cnt = 0
+        while True:
+            lines = f.readlines(num_edges * 2)
+            if not lines:
+                break
+            for line in lines:
+                line = line.rstrip('\n').split()
+                edge_index[0][cnt] = int(line[0]) - 1
+                edge_index[1][cnt] = int(line[1]) - 1
+                edge_attr[cnt][0] = float(line[2])
+                cnt += 1
+
+    # Create PyTorch Geometric data object
+    data_maxcut = Data(
+        num_nodes=num_nodes,
+        edge_index=edge_index,
+        edge_attr=edge_attr
+    )
+    data_maxcut = data_maxcut.to(device)
+
+    # Total sum of edge weights
+    data_maxcut.edge_weight_sum = float(
+        torch.sum(data_maxcut.edge_attr)
+    )
+
+    # -----------------------------
+    # Build neighbor structures
+    # -----------------------------
+    data_maxcut = append_neighbors(data_maxcut)
+
+    # -----------------------------
+    # Compute node degrees
+    # -----------------------------
+    data_maxcut.single_degree = []
+    data_maxcut.weighted_degree = []
+    tensor_abs_weighted_degree = []
+
+    for i0 in range(data_maxcut.num_nodes):
+        # Number of neighbors
+        data_maxcut.single_degree.append(
+            len(data_maxcut.neighbors[i0])
+        )
+
+        # Sum of incident edge weights
+        data_maxcut.weighted_degree.append(
+            float(torch.sum(data_maxcut.neighbor_edges[i0]))
+        )
+
+        # Sum of absolute incident edge weights
+        tensor_abs_weighted_degree.append(
+            float(torch.sum(
+                torch.abs(data_maxcut.neighbor_edges[i0])
+            ))
+        )
+
+    tensor_abs_weighted_degree = torch.tensor(
+        tensor_abs_weighted_degree
+    )
+
+    # Nodes sorted by absolute weighted degree
+    data_maxcut.sorted_degree_nodes = torch.argsort(
+        tensor_abs_weighted_degree, descending=True
+    )
+
+    # -----------------------------
+    # Compute edge scores and auxiliary constants
+    # -----------------------------
+    edge_degree = []
+    add = torch.zeros(3, num_edges).to(device)
+
+    for i0 in range(num_edges):
+        # Edge importance score (used for ordering)
+        edge_degree.append(
+            abs(edge_attr[i0].item()) * (
+                tensor_abs_weighted_degree[edge_index[0][i0]]
+                + tensor_abs_weighted_degree[edge_index[1][i0]]
+            )
+        )
+
+        node_r = edge_index[0][i0]
+        node_c = edge_index[1][i0]
+
+        # Precomputed constants for local search updates
+        add[0][i0] = (
+            - data_maxcut.weighted_degree[node_r] / 2
+            + data_maxcut.edge_attr[i0] - 0.05
+        )
+        add[1][i0] = (
+            - data_maxcut.weighted_degree[node_c] / 2
+            + data_maxcut.edge_attr[i0] - 0.05
+        )
+        add[2][i0] = data_maxcut.edge_attr[i0] + 0.05
+
+    # Ensure neighbor edge tensors have batch dimension
+    for i0 in range(num_nodes):
+        data_maxcut.neighbor_edges[i0] = (
+            data_maxcut.neighbor_edges[i0].unsqueeze(0)
+        )
+
+    data_maxcut.add = add
+
+    # Sort edges by importance
+    edge_degree = torch.tensor(edge_degree)
+    data_maxcut.sorted_degree_edges = torch.argsort(
+        edge_degree, descending=True
+    )
+
+    return data_maxcut, num_nodes
+
+def append_neighbors(data,
+                     device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
+    """
+    Augment a graph data object with neighbor and edge-adjacency information.
+
+    This function builds:
+        - Node-wise neighbor lists and incident edge weights
+        - Edge-wise neighbor node sets excluding the edge itself (n0, n1)
+        - Corresponding edge-weight tensors (n0_edges, n1_edges)
+
+    These auxiliary structures are used to accelerate local search updates
+    in MCPG-based Max-Cut / Ising optimization.
+
+    Parameters
+    ----------
+    data : Data
+        PyTorch Geometric Data object with attributes:
+            - num_nodes
+            - edge_index
+            - edge_attr
+    device : torch.device
+        Device on which tensors are allocated.
+
+    Returns
+    -------
+    Data
+        The input data object augmented with neighbor-related attributes.
+    """
+
+    # -----------------------------
+    # Initialize neighbor containers
+    # -----------------------------
+    data.neighbors = []        # neighbors[i]: list of neighbors of node i
+    data.neighbor_edges = []   # neighbor_edges[i]: edge weights incident to node i
+    num_nodes = data.num_nodes
+
+    for i in range(num_nodes):
+        data.neighbors.append([])
+        data.neighbor_edges.append([])
+
+    edge_number = data.edge_index.shape[1]
+
+    # -----------------------------
+    # Build node-wise adjacency lists
+    # -----------------------------
+    for index in range(edge_number):
+        row = data.edge_index[0][index]
+        col = data.edge_index[1][index]
+        edge_weight = data.edge_attr[index][0].item()
+
+        # Undirected graph: add both directions
+        data.neighbors[row].append(col.item())
+        data.neighbor_edges[row].append(edge_weight)
+
+        data.neighbors[col].append(row.item())
+        data.neighbor_edges[col].append(edge_weight)
+
+    # -----------------------------
+    # Build edge-wise neighbor sets
+    # -----------------------------
+    data.n0 = []        # neighbors of edge's first endpoint (excluding second)
+    data.n1 = []        # neighbors of edge's second endpoint (excluding first)
+    data.n0_edges = []  # corresponding edge weights
+    data.n1_edges = []
+
+    for index in range(edge_number):
+        row = data.edge_index[0][index]
+        col = data.edge_index[1][index]
+
+        # Copy full neighbor lists
+        data.n0.append(data.neighbors[row].copy())
+        data.n1.append(data.neighbors[col].copy())
+        data.n0_edges.append(data.neighbor_edges[row].copy())
+        data.n1_edges.append(data.neighbor_edges[col].copy())
+
+        # Remove the current edge endpoint from each list
+        for i in range(len(data.n0[index])):
+            if data.n0[index][i] == col:
+                break
+        data.n0[index].pop(i)
+        data.n0_edges[index].pop(i)
+
+        for i in range(len(data.n1[index])):
+            if data.n1[index][i] == row:
+                break
+        data.n1[index].pop(i)
+        data.n1_edges[index].pop(i)
+
+        # 
Convert to tensors + data.n0[index] = torch.LongTensor(data.n0[index]).to(device) + data.n1[index] = torch.LongTensor(data.n1[index]).to(device) + + data.n0_edges[index] = torch.tensor( + data.n0_edges[index] + ).unsqueeze(0).to(device) + data.n1_edges[index] = torch.tensor( + data.n1_edges[index] + ).unsqueeze(0).to(device) + + # ----------------------------- + # Convert node-wise lists to tensors + # ----------------------------- + for i in range(num_nodes): + data.neighbors[i] = torch.LongTensor(data.neighbors[i]).to(device) + data.neighbor_edges[i] = torch.tensor( + data.neighbor_edges[i] + ).to(device) + + return data + + +if __name__ == "__main__": + """ + Main entry point for running the MCPG solver. + + This script: + 1) Parses command-line arguments, + 2) Loads configuration and problem instance, + 3) Runs the MCPG solver, + 4) Prints the final result and timing statistics. + """ + + # Select computation device + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + # ----------------------------- + # Argument parsing + # ----------------------------- + parser = argparse.ArgumentParser() + parser.add_argument( + "config_file", + type=str, + help="Path to the YAML configuration file for the MCPG solver" + ) + parser.add_argument( + "problem_instance", + type=str, + help="Path to the problem instance file (e.g., Max-Cut / Max-SAT)" + ) + + args = parser.parse_args() + + # ----------------------------- + # Load configuration + # ----------------------------- + with open(args.config_file) as f: + config = yaml.safe_load(f) + + path = args.problem_instance + + # ----------------------------- + # Data loading + # ----------------------------- + start_time = time.perf_counter() + + dataloader = maxcut_dataloader + data, nvar = dataloader(path) + + dataloader_t = time.perf_counter() + + # ----------------------------- + # Run MCPG solver + # ----------------------------- + res, solutions, _, _ = mcpg_solver( + nvar, config, data, verbose=True + ) + + mcpg_t = time.perf_counter() + + # ----------------------------- + # Output post-processing + # ----------------------------- + if config["problem_type"] == "maxsat" and len(data.pdata) == 7: + # Special handling for Max-SAT instances + if res > data.pdata[5] * data.pdata[6]: + res -= data.pdata[5] * data.pdata[6] + print("SATISFIED") + print("SATISFIED SOFT CLAUSES:", res) + print( + "UNSATISFIED SOFT CLAUSES:", + data.pdata[1] - data.pdata[-1] - res + ) + else: + res = res // data.pdata[5] - data.pdata[6] + print("UNSATISFIED") + + elif "obj_type" in config and config["obj_type"] == "neg": + # Objective is negated + print("OUTPUT: {:.2f}".format(-res)) + + else: + # Default output + print("OUTPUT: {:f}".format(res)) + + # ----------------------------- + # Timing statistics + # ----------------------------- + print( + "DATA LOADING TIME: {:f}".format( + dataloader_t - start_time + ) + ) + print( + "MCPG RUNNING TIME: {:f}".format( + mcpg_t - dataloader_t + ) + ) \ No newline at end of file diff --git a/src/tutorials/.ipynb_checkpoints/eco_dqn-checkpoint.ipynb b/src/tutorials/.ipynb_checkpoints/eco_dqn-checkpoint.ipynb new file mode 100644 index 0000000..363fcab --- /dev/null +++ b/src/tutorials/.ipynb_checkpoints/eco_dqn-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/tutorials/MCPG/.ipynb_checkpoints/mcpg-checkpoint.ipynb b/src/tutorials/MCPG/.ipynb_checkpoints/mcpg-checkpoint.ipynb new file mode 100644 index 0000000..09ca138 --- /dev/null 
+++ b/src/tutorials/MCPG/.ipynb_checkpoints/mcpg-checkpoint.ipynb @@ -0,0 +1,118 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cc5968fb-06de-4d8c-a212-da153e2c171d", + "metadata": {}, + "source": [ + "# MCPG Tutorial (Quickstart Notebook)\n", + "\n", + "This notebook walks you through setting up and exploring the **[optsuite/MCPG](https://github.com/optsuite/MCPG)** repository locally, including:\n", + "\n", + "1. Environment setup (conda / pip)\n", + "2. Cloning the repository\n", + "3. Adding the project to `PYTHONPATH`\n", + "4. Exploring the configuration files under `config/`\n", + "5. A minimal workflow to **train** and **evaluate** with the provided drivers (if available)\n", + "6. Tips for customizing experiments and logging results\n", + "\n", + "> **Note:** This notebook is designed to be run on your machine (or Colab). Some cells use shell commands like `git` or `python` that require internet access and a proper environment.\n" + ] + }, + { + "cell_type": "markdown", + "id": "4171ae3d-aa26-4367-8353-f9946b0c6a57", + "metadata": {}, + "source": [ + "## 0) Prerequisites\n", + "\n", + "- **Python 3.10+** (match the repo's `environment.yaml` if provided)\n", + "- **conda** or **mamba** (recommended)\n", + "- **git**\n", + "- (Optional) **CUDA** if you plan to use GPU acceleration\n", + "\n", + "If the repository provides an `environment.yaml`, you can create a conda environment with:\n", + "\n", + "```bash\n", + "conda env create -f environment.yaml -n mcpg\n", + "conda activate mcpg\n", + "```\n", + "\n", + "If you prefer `mamba`:\n", + "```bash\n", + "mamba env create -f environment.yaml -n mcpg\n", + "mamba activate mcpg\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "id": "c40445b5-47a5-4658-b752-1605101db4d6", + "metadata": {}, + "source": [ + "## 1) Clone the repository\n", + "\n", + "Run the following cell to clone the repo next to this notebook (skip if you've already cloned it)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4e4f8b3-356f-444a-9de0-058d8a63eea0", + "metadata": {}, + "outputs": [], + "source": [ + "# If running locally/Colab, uncomment to clone:\n", + "# !git clone https://github.com/optsuite/MCPG.git\n", + "# %cd MCPG\n", + "print('If you run this notebook locally, uncomment the git commands above.')" + ] + }, + { + "cell_type": "markdown", + "id": "647b2dc7-2729-44ad-a1c5-b3321f43c98d", + "metadata": {}, + "source": [ + "## 2) Add project to PYTHONPATH\n", + "\n", + "Many repos use a `src/` layout. We'll add it to `sys.path` so Python can find the modules." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aed436cd-eeb4-41b6-8be8-d3a74b69cd2e", + "metadata": {}, + "outputs": [], + "source": [ + "import os, sys\n", + "repo_root = os.path.abspath('.') # if you've done %cd MCPG already\n", + "src_path = os.path.join(repo_root, 'src')\n", + "if os.path.isdir(src_path) and src_path not in sys.path:\n", + " sys.path.insert(0, src_path)\n", + "print('Added to sys.path:', src_path)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (maxcut)", + "language": "python", + "name": "maxcut" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/tutorials/MCPG/dataloader.py b/src/tutorials/MCPG/dataloader.py new file mode 100644 index 0000000..54b6632 --- /dev/null +++ b/src/tutorials/MCPG/dataloader.py @@ -0,0 +1,309 @@ +""" +Copyright (c) 2024 Cheng Chen, Ruitao Chen, Tianyou Li, Zaiwen Wen +All rights reserved. +Redistribution and use in source and binary forms, with or without modification, are permitted provided that +the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the + following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions + and the following disclaimer in the documentation and/or other materials provided with the distribution. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+""" +import torch +import os +from torch_geometric.data import Data +import numpy as np +import scipy.io as scio + + +def dataloader_select(problem_type): + if problem_type in ["maxcut", "maxcut_edge", "r_cheegercut", "n_cheegercut", 'ising']: + return maxcut_dataloader + elif problem_type == "maxsat": + return maxsat_dataloader + elif problem_type in ["qubo", "qubo_bin"]: + return qubo_dataloader + else: + raise (Exception("Unrecognized problem type {}".format(problem_type))) + + +def maxcut_dataloader(path, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')): + with open(path) as f: + fline = f.readline() + fline = fline.split() + num_nodes, num_edges = int(fline[0]), int(fline[1]) + edge_index = torch.LongTensor(2, num_edges) + edge_attr = torch.Tensor(num_edges, 1) + cnt = 0 + while True: + lines = f.readlines(num_edges * 2) + if not lines: + break + for line in lines: + line = line.rstrip('\n').split() + edge_index[0][cnt] = int(line[0]) - 1 + edge_index[1][cnt] = int(line[1]) - 1 + edge_attr[cnt][0] = float(line[2]) + cnt += 1 + data_maxcut = Data(num_nodes=num_nodes, + edge_index=edge_index, edge_attr=edge_attr) + data_maxcut = data_maxcut.to(device) + data_maxcut.edge_weight_sum = float(torch.sum(data_maxcut.edge_attr)) + + data_maxcut = append_neighbors(data_maxcut) + + data_maxcut.single_degree = [] + data_maxcut.weighted_degree = [] + tensor_abs_weighted_degree = [] + for i0 in range(data_maxcut.num_nodes): + data_maxcut.single_degree.append(len(data_maxcut.neighbors[i0])) + data_maxcut.weighted_degree.append( + float(torch.sum(data_maxcut.neighbor_edges[i0]))) + tensor_abs_weighted_degree.append( + float(torch.sum(torch.abs(data_maxcut.neighbor_edges[i0])))) + tensor_abs_weighted_degree = torch.tensor(tensor_abs_weighted_degree) + data_maxcut.sorted_degree_nodes = torch.argsort( + tensor_abs_weighted_degree, descending=True) + + edge_degree = [] + add = torch.zeros(3, num_edges).to(device) + for i0 in range(num_edges): + edge_degree.append(abs(edge_attr[i0].item())*( + tensor_abs_weighted_degree[edge_index[0][i0]]+tensor_abs_weighted_degree[edge_index[1][i0]])) + node_r = edge_index[0][i0] + node_c = edge_index[1][i0] + add[0][i0] = - data_maxcut.weighted_degree[node_r] / \ + 2 + data_maxcut.edge_attr[i0] - 0.05 + add[1][i0] = - data_maxcut.weighted_degree[node_c] / \ + 2 + data_maxcut.edge_attr[i0] - 0.05 + add[2][i0] = data_maxcut.edge_attr[i0]+0.05 + + for i0 in range(num_nodes): + data_maxcut.neighbor_edges[i0] = data_maxcut.neighbor_edges[i0].unsqueeze( + 0) + data_maxcut.add = add + edge_degree = torch.tensor(edge_degree) + data_maxcut.sorted_degree_edges = torch.argsort( + edge_degree, descending=True) + + return data_maxcut, num_nodes + + +def append_neighbors(data, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')): + data.neighbors = [] + data.neighbor_edges = [] + num_nodes = data.num_nodes + for i in range(num_nodes): + data.neighbors.append([]) + data.neighbor_edges.append([]) + edge_number = data.edge_index.shape[1] + + for index in range(0, edge_number): + row = data.edge_index[0][index] + col = data.edge_index[1][index] + edge_weight = data.edge_attr[index][0].item() + + data.neighbors[row].append(col.item()) + data.neighbor_edges[row].append(edge_weight) + data.neighbors[col].append(row.item()) + data.neighbor_edges[col].append(edge_weight) + + data.n0 = [] + data.n1 = [] + data.n0_edges = [] + data.n1_edges = [] + for index in range(0, edge_number): + row = data.edge_index[0][index] + col = data.edge_index[1][index] + 
data.n0.append(data.neighbors[row].copy()) + data.n1.append(data.neighbors[col].copy()) + data.n0_edges.append(data.neighbor_edges[row].copy()) + data.n1_edges.append(data.neighbor_edges[col].copy()) + i = 0 + for i in range(len(data.n0[index])): + if data.n0[index][i] == col: + break + data.n0[index].pop(i) + data.n0_edges[index].pop(i) + for i in range(len(data.n1[index])): + if data.n1[index][i] == row: + break + data.n1[index].pop(i) + data.n1_edges[index].pop(i) + + data.n0[index] = torch.LongTensor(data.n0[index]).to(device) + data.n1[index] = torch.LongTensor(data.n1[index]).to(device) + data.n0_edges[index] = torch.tensor( + data.n0_edges[index]).unsqueeze(0).to(device) + data.n1_edges[index] = torch.tensor( + data.n1_edges[index]).unsqueeze(0).to(device) + + for i in range(num_nodes): + data.neighbors[i] = torch.LongTensor(data.neighbors[i]).to(device) + data.neighbor_edges[i] = torch.tensor( + data.neighbor_edges[i]).to(device) + + return data + + +class Data_MaxSAT(object): + def __init__(self, pdata=None, ndata=None): + self.pdata = pdata + self.ndata = ndata + + +def maxsat_dataloader(path, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')): + ext = os.path.splitext(path)[-1] + if ext == ".cnf": + ptype = 'n' + elif ext == ".wcnf": + ptype = 'p' + else: + raise (Exception("Unrecognized file type {}".format(path))) + + with open(path) as f: + lines = f.readlines() + variable_index = [] + clause_index = [] + neg_index = [] + clause_cnt = 0 + nhard = 0 + nvi = [] + nci = [] + nneg = [] + tempvi = [] + tempneg = [] + vp = [] + vn = [] + for line in lines: + line = line.split() + if len(line) == 0: + continue + elif line[0] == "c": + continue + elif line[0] == "p": + if ptype == 'p': + weight = int(line[4]) + nvar, nclause = int(line[2]), int(line[3]) + for i0 in range(nvar): + nvi.append([]) + nci.append([]) + nneg.append([]) + vp = [0]*nvar + vn = [0]*nvar + continue + tempvi = [] + tempneg = [] + if ptype == 'p': + clause_weight_i = int(line[0]) + if clause_weight_i == weight: + nhard += 1 + for ety in line[1:-1]: + ety = int(ety) + variable_index.append(abs(ety) - 1) + tempvi.append(abs(ety) - 1) + clause_index.append(clause_cnt) + neg_index.append(int(ety/abs(ety))*clause_weight_i) + tempneg.append(int(ety/abs(ety))*clause_weight_i) + if ety > 0: + vp[abs(ety) - 1] += 1 + else: + vn[abs(ety) - 1] += 1 + else: + for ety in line: + if ety == '0': + continue + ety = int(ety) + variable_index.append(abs(ety) - 1) + tempvi.append(abs(ety) - 1) + clause_index.append(clause_cnt) + neg_index.append(int(ety/abs(ety))) + tempneg.append(int(ety/abs(ety))) + if ety > 0: + vp[abs(ety) - 1] += 1 + else: + vn[abs(ety) - 1] += 1 + for i0 in range(len(tempvi)): + node = tempvi[i0] + nvi[node] += tempvi + nneg[node] += tempneg + temp = len(nci[node]) + if temp > 0: + temp = nci[node][temp-1]+1 + nci[node] += [temp]*len(tempvi) + clause_cnt += 1 + degree = [] + for i0 in range(nvar): + nvi[i0] = torch.LongTensor(nvi[i0]).to(device) + nci[i0] = torch.LongTensor(nci[i0]).to(device) + nneg[i0] = torch.tensor(nneg[i0]).to(device) + degree.append(vp[i0]+vn[i0]) + degree = torch.FloatTensor(degree).to(device) + sorted = torch.argsort(degree, descending=True).to('cpu') + neg_index = torch.tensor(neg_index).to(device) + ci_cuda = torch.tensor(clause_index).to(device) + + ndata = [nvi, nci, nneg, sorted, degree] + ndata = sort_node(ndata) + + pdata = [nvar, nclause, variable_index, ci_cuda, neg_index] + if ptype == 'p': + pdata = [nvar, nclause, variable_index, + ci_cuda, neg_index, 
weight, nhard]
+    return Data_MaxSAT(pdata=pdata, ndata=ndata), pdata[0]
+
+
+def sort_node(ndata):
+    degree = ndata[4]
+    device = degree.device
+    temp = degree + (torch.rand(degree.shape[0], device=device)-0.5)/2
+    sorted = torch.argsort(temp, descending=True).to('cpu')
+    ndata[3] = sorted
+    return ndata
+
+
+def qubo_dataloader(path, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
+    Q = np.load(path)
+    Q = torch.tensor(Q).float().to(device)
+    data = {'Q': Q, 'nvar': Q.shape[0]}
+    return data, Q.shape[0]
+
+
+def read_data_mimo(K, N, SNR, X_num, r_seed, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
+
+    path = "../data/mimo/4QAM{}_{}/4QAM{}H{}.mat".format(N, K, K, int(r_seed//X_num+1))
+    data = scio.loadmat(path)
+    H = data["save_H"]
+    path = "../data/mimo/4QAM{}_{}/4QAM{}X{}.mat".format(N, K, K, int(r_seed//X_num+1))
+    data = scio.loadmat(path)
+    X = data["save_X"][r_seed % X_num]
+    path = "../data/mimo/4QAM{}_{}/4QAM{}v{}.mat".format(N, K, K, int(r_seed//X_num+1))
+    data = scio.loadmat(path)
+    v = data["save_v"][r_seed % X_num]
+    v = np.sqrt(2*K*10**(-SNR/10)) * v
+
+    Y = H.dot(X) + v
+    noise = np.linalg.norm(v)
+
+    Sigma = H.T.dot(H)
+    Diag = -2*Y.T.dot(H)
+    sca = Y.T.dot(Y)
+    for i in range(Sigma.shape[0]):
+        sca += Sigma[i][i]
+        Sigma[i][i] = 0
+
+    # to cuda
+    Sigma = torch.tensor(Sigma).to(device)
+    Diag = torch.tensor(Diag).to(device)
+    X = torch.tensor(X).to(device)
+    sca = torch.tensor(sca).to(device)
+
+    data = [Sigma, Diag, X, sca, noise]
+    return data
+
diff --git a/src/tutorials/MCPG/inference.py b/src/tutorials/MCPG/inference.py
new file mode 100644
index 0000000..8689e93
--- /dev/null
+++ b/src/tutorials/MCPG/inference.py
@@ -0,0 +1,95 @@
+# --- inference.py ---
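+# Example invocation (the file names are illustrative, not shipped with the repo):
+#   python inference.py config/maxcut_default.yaml trained_simple_model_800.pth data/new_instance.txt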
+import torch
+import yaml
+import argparse
+import time
+
+# Import the models and the data loader
+from model import simple, TransformerModel
+from dataloader import dataloader_select
+# Import the sampler: we sample with the policy produced by the model
+from sampling import sampler_select, sample_initializer
+
+def main():
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+    # 1. Set up the command-line arguments
+    parser = argparse.ArgumentParser(description="Generalization Test for MCPG Models")
+    parser.add_argument("config_file", type=str, help="Configuration file used for TRAINING the model.")
+    parser.add_argument("saved_model_path", type=str, help="Path to the saved .pth model file.")
+    parser.add_argument("test_instance", type=str, help="Path to the NEW, unseen problem instance for testing.")
+    args = parser.parse_args()
+
+    # 2. Load the configuration file to recover the model architecture
+    with open(args.config_file) as f:
+        config = yaml.safe_load(f)
+
+    # 3. Load the test data for the new problem instance
+    dataloader = dataloader_select(config["problem_type"])
+    test_data, nvar = dataloader(args.test_instance)
+
+    # 4. Initialize the model architecture (must match training exactly)
+    model_type = config.get("model_type", "simple")
+    net = None
+    if model_type == "transformer":
+        print("===== Loading Transformer Model for Inference =====")
+        model_params = config['transformer_params']
+        net = TransformerModel(nvar=nvar, **model_params)
+    else:
+        print("===== Loading Simple Model for Inference =====")
+        net = simple(nvar=nvar)
+
+    # 5. Load the trained model weights
+    net.load_state_dict(torch.load(args.saved_model_path, map_location=device))
+    net.to(device)
+    net.eval()  # Switch to evaluation mode; disables dropout and other train-only layers
+
+    print(f"Model loaded from {args.saved_model_path}")
+    print(f"Testing on instance {args.test_instance} with {nvar} variables.")
+
+    # 6. Run inference once to obtain the policy (probs);
+    # a single forward pass, the network is never updated here
+    with torch.no_grad():
+        if model_type == "transformer":
+            retdict = net(test_data, device=device)  # start_samples=None
+        else:
+            retdict = net(device=device)
+    probs = retdict["output"][0].detach().float()
+
+    # Numerical safety (avoids Bernoulli errors inside the sampler)
+    probs = torch.nan_to_num(probs, nan=0.5, posinf=1.0, neginf=0.0).clamp_(1e-6, 1-1e-6)
+
+    # 7. Run one full MCPG round exactly as during training, without updating the model.
+    # Key point: the sampler itself performs the MCMC distribution changes plus
+    # several local-search sweeps.
+    sampler = sampler_select(config["problem_type"])
+
+    # Reuse the training-time hyperparameters (or fall back to defaults)
+    num_ls_inference = int(config.get("num_ls_inference", config.get("num_ls", 12)))
+    change_times_inference = int(config.get("change_times_inference", config.get("change_times", 2)))
+    total_mcmc_num = int(config.get("total_mcmc_num", 2000))
+
+    # Initialize the samples once from the model's probabilities (as in training)
+    tensor_probs = sample_initializer(config["problem_type"], probs, config, data=test_data)
+
+    # Let the sampler run a complete round: change_times MCMC updates plus local search.
+    # Note: the sampler only updates the current distribution/samples;
+    # it never touches the network parameters.
+    with torch.no_grad():
+        best_score, best_solution, _, _ = sampler(
+            test_data,
+            tensor_probs,
+            probs,  # initial distribution
+            num_ls_inference,
+            change_times_inference,
+            total_mcmc_num
+        )
+
+    final_result = max(best_score).item()
+    if "obj_type" in config and config["obj_type"] == "neg":
+        print(f"OUTPUT: {-final_result:.2f}")
+    else:
+        print(f"OUTPUT: {final_result:.2f}")
+
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/src/tutorials/MCPG/mcpg.ipynb b/src/tutorials/MCPG/mcpg.ipynb
new file mode 100644
index 0000000..09ca138
--- /dev/null
+++ b/src/tutorials/MCPG/mcpg.ipynb
@@ -0,0 +1,118 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "cc5968fb-06de-4d8c-a212-da153e2c171d",
+   "metadata": {},
+   "source": [
+    "# MCPG Tutorial (Quickstart Notebook)\n",
+    "\n",
+    "This notebook walks you through setting up and exploring the **[optsuite/MCPG](https://github.com/optsuite/MCPG)** repository locally, including:\n",
+    "\n",
+    "1. Environment setup (conda / pip)\n",
+    "2. Cloning the repository\n",
+    "3. Adding the project to `PYTHONPATH`\n",
+    "4. Exploring the configuration files under `config/`\n",
+    "5. A minimal workflow to **train** and **evaluate** with the provided drivers (if available)\n",
+    "6. Tips for customizing experiments and logging results\n",
+    "\n",
+    "> **Note:** This notebook is designed to be run on your machine (or Colab). Some cells use shell commands like `git` or `python` that require internet access and a proper environment.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4171ae3d-aa26-4367-8353-f9946b0c6a57",
+   "metadata": {},
+   "source": [
+    "## 0) Prerequisites\n",
+    "\n",
+    "- **Python 3.10+** (match the repo's `environment.yaml` if provided)\n",
+    "- **conda** or **mamba** (recommended)\n",
+    "- **git**\n",
+    "- (Optional) **CUDA** if you plan to use GPU acceleration\n",
+    "\n",
+    "If the repository provides an `environment.yaml`, you can create a conda environment with:\n",
+    "\n",
+    "```bash\n",
+    "conda env create -f environment.yaml -n mcpg\n",
+    "conda activate mcpg\n",
+    "```\n",
+    "\n",
+    "If you prefer `mamba`:\n",
+    "```bash\n",
+    "mamba env create -f environment.yaml -n mcpg\n",
+    "mamba activate mcpg\n",
+    "```\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c40445b5-47a5-4658-b752-1605101db4d6",
+   "metadata": {},
+   "source": [
+    "## 1) Clone the repository\n",
+    "\n",
+    "Run the following cell to clone the repo next to this notebook (skip if you've already cloned it)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d4e4f8b3-356f-444a-9de0-058d8a63eea0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# If running locally/Colab, uncomment to clone:\n",
+    "# !git clone https://github.com/optsuite/MCPG.git\n",
+    "# %cd MCPG\n",
+    "print('If you run this notebook locally, uncomment the git commands above.')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "647b2dc7-2729-44ad-a1c5-b3321f43c98d",
+   "metadata": {},
+   "source": [
+    "## 2) Add project to PYTHONPATH\n",
+    "\n",
+    "Many repos use a `src/` layout. We'll add it to `sys.path` so Python can find the modules."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aed436cd-eeb4-41b6-8be8-d3a74b69cd2e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os, sys\n",
+    "repo_root = os.path.abspath('.')  # if you've done %cd MCPG already\n",
+    "src_path = os.path.join(repo_root, 'src')\n",
+    "if os.path.isdir(src_path) and src_path not in sys.path:\n",
+    "    sys.path.insert(0, src_path)\n",
+    "print('Added to sys.path:', src_path)"
+   ]
+  },
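+  {
+   "cell_type": "markdown",
+   "id": "9f3c2d10-5a6b-4c7d-8e9f-0a1b2c3d4e5f",
+   "metadata": {},
+   "source": [
+    "## 3) Explore the configuration files\n",
+    "\n",
+    "The solver drivers read their settings (`problem_type`, `lr_init`, `total_mcmc_num`, ...) from YAML files under `config/`. The exact file names depend on your checkout, so the cell below simply lists whatever is there and pretty-prints the first one (it reuses `repo_root` from the previous cell).\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9f3c2d10-5a6b-4c7d-8e9f-0a1b2c3d4e60",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import glob, os, pprint\n",
+    "import yaml\n",
+    "\n",
+    "# List the YAML configs shipped with the repo and inspect the first one.\n",
+    "cfg_files = sorted(glob.glob(os.path.join(repo_root, 'config', '*.yaml')))\n",
+    "print('Found configs:', cfg_files)\n",
+    "if cfg_files:\n",
+    "    with open(cfg_files[0]) as f:\n",
+    "        pprint.pprint(yaml.safe_load(f))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9f3c2d10-5a6b-4c7d-8e9f-0a1b2c3d4e61",
+   "metadata": {},
+   "source": [
+    "## 4) Minimal train / evaluate workflow\n",
+    "\n",
+    "`mcpg.py` solves a problem instance and saves the trained policy network as `trained_<model_type>_model_<nvar>.pth`; `inference.py` reloads that checkpoint and tests how well it generalizes to a new instance. The paths below are illustrative; point them at a real config and instance file from your checkout.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9f3c2d10-5a6b-4c7d-8e9f-0a1b2c3d4e62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Train/solve on an instance (uncomment and adjust the paths):\n",
+    "# !python mcpg.py config/maxcut_default.yaml data/graph/my_instance.txt\n",
+    "\n",
+    "# Evaluate the saved model on a new, unseen instance:\n",
+    "# !python inference.py config/maxcut_default.yaml trained_simple_model_800.pth data/graph/new_instance.txt\n",
+    "print('Uncomment the commands above and adjust the paths for your setup.')"
+   ]
+  }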
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python (maxcut)",
+   "language": "python",
+   "name": "maxcut"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/tutorials/MCPG/mcpg.py b/src/tutorials/MCPG/mcpg.py
new file mode 100644
index 0000000..6c401a1
--- /dev/null
+++ b/src/tutorials/MCPG/mcpg.py
@@ -0,0 +1,68 @@
+"""
+Copyright (c) 2024 Cheng Chen, Ruitao Chen, Tianyou Li, Zaiwen Wen
+All rights reserved.
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that
+the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
+   following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
+   and the following disclaimer in the documentation and/or other materials provided with the distribution.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+""" +import torch +import yaml +import argparse +import time + +from mcpg_solver import mcpg_solver +from dataloader import dataloader_select + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +parser = argparse.ArgumentParser() +parser.add_argument("config_file", type=str, + help="input the configuration file for the mcpg solver") +parser.add_argument("problem_instance", type=str, + help="input the data file for the problem instance") + +args = parser.parse_args() +with open(args.config_file) as f: + config = yaml.safe_load(f) + +path = args.problem_instance +start_time = time.perf_counter() +dataloader = dataloader_select(config["problem_type"]) +data, nvar = dataloader(path) +dataloader_t = time.perf_counter() +res, solutions, _, _, net = mcpg_solver(nvar, config, data, verbose=True) +mcpg_t = time.perf_counter() + +# 根据配置文件来决定保存路径s +model_type = config.get("model_type", "simple") +save_path = f"trained_{model_type}_model_{nvar}.pth" +torch.save(net.state_dict(), save_path) +print(f"===== Model saved to {save_path} =====") + +if config["problem_type"] == "maxsat" and len(data.pdata) == 7: + if res > data.pdata[5] * data.pdata[6]: + res -= data.pdata[5] * data.pdata[6] + print("SATISFIED") + print("SATISFIED SOFT CLAUSES:", res) + print("UNSATISFIED SOFT CLAUSES:", data.pdata[1] - data.pdata[-1] - res) + else: + res = res//data.pdata[5]-data.pdata[6] + print("UNSATISFIED") + +elif "obj_type" in config and config["obj_type"] == "neg": + print("OUTPUT: {:.2f}".format(-res)) +else: + print("OUTPUT: {:.2f}".format(res)) + + +print("DATA LOADING TIME: {:.2f}".format(dataloader_t - start_time)) +print("MCPG RUNNING TIME: {:.2f}".format(mcpg_t - dataloader_t)) diff --git a/src/tutorials/MCPG/mcpg_solver.py b/src/tutorials/MCPG/mcpg_solver.py new file mode 100644 index 0000000..6d8d7ce --- /dev/null +++ b/src/tutorials/MCPG/mcpg_solver.py @@ -0,0 +1,152 @@ +""" +Copyright (c) 2024 Cheng Chen, Ruitao Chen, Tianyou Li, Zaiwen Wen +All rights reserved. +Redistribution and use in source and binary forms, with or without modification, are permitted provided that +the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the + following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions + and the following disclaimer in the documentation and/or other materials provided with the distribution. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+""" +import torch + +# 导入我们新增的 TransformerModel 和原始的 simple 模型 +from src.model import simple, TransformerModel +from src.sampling import sampler_select, sample_initializer + + +def mcpg_solver(nvar, config, data, verbose=False): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + sampler = sampler_select(config["problem_type"]) + + change_times = int(nvar/10) # transition times for metropolis sampling + + # ================= 动态模型选择 (Dynamic Model Selection) ================= + # 从配置文件读取要使用的模型类型,如果未指定,则默认为 'simple' + model_type = config.get("model_type", "simple") + + net = None + if model_type == "transformer": + print("===== Using Transformer Model =====") + # 从配置中读取Transformer的超参数 + # 如果配置中没有 'transformer_params',则会报错,这是为了提醒用户必须提供参数 + model_params = config['transformer_params'] + net = TransformerModel( + nvar=nvar, + embedding_dim=model_params['embedding_dim'], + num_heads=model_params['num_heads'], + num_layers=model_params['num_layers'], + dim_feedforward=model_params['dim_feedforward'] + ) + else: # 默认为 simple 模型 + print("===== Using Simple Model =====") + model_type = "simple" # 确保变量被设置 + net = simple(nvar) + + net.to(device) + optimizer = torch.optim.Adam(net.parameters(), lr=config['lr_init']) + + start_samples = None + # 初始化用于存储最佳结果的变量 + # 这一步是为了确保即使在第一个采样周期就找到好结果,也能被正确记录 + now_max_res = torch.full((config['total_mcmc_num'],), float('-inf'), device=device) + now_max_info = torch.zeros(nvar, config['total_mcmc_num'], device=device) + + for epoch in range(config['max_epoch_num']): + + if epoch % config['reset_epoch_num'] == 0: + net.reset_parameters() + regular = config['regular_init'] + + net.train() + + # ================= 动态模型调用 (Dynamic Model Invocation) ================= + # 根据模型类型,决定调用 forward 时是否传入 data 对象 + if model_type == "transformer": + # Transformer 模型需要 data 对象来构建注意力掩码 + if epoch <= 0: + retdict = net(data, regular, None, None, device=device) + else: + retdict = net(data, regular, start_samples, value, device=device) + else: # simple 模型 + if epoch <= 0: + retdict = net(regular, None, None, device=device) + else: + retdict = net(regular, start_samples, value, device=device) + + retdict["loss"][0].backward() + torch.nn.utils.clip_grad_norm_(net.parameters(), 1) + optimizer.step() + + # get start samples (获取初始样本) + if epoch == 0: + probs = (torch.zeros(nvar)+0.5).to(device) + tensor_probs = sample_initializer( + config["problem_type"], probs, config, data=data) + temp_max, temp_max_info, temp_start_samples, value = sampler( + data, tensor_probs, probs, config['num_ls'], 0, config['total_mcmc_num']) + + # 首次运行时直接更新最佳结果 + now_max_res = temp_max + now_max_info = temp_max_info + + tensor_probs = temp_max_info.clone() + tensor_probs = tensor_probs.repeat(1, config['repeat_times']) + start_samples = temp_start_samples.t().to(device) + + # get samples (获取后续样本) + if epoch % config['sample_epoch_num'] == 0 and epoch > 0: + probs = retdict["output"][0].detach() # detach()确保不会有梯度流回 + + # 使用模型生成的概率进行采样和评估 + temp_max, temp_max_info, start_samples_temp, value = sampler( + data, tensor_probs, probs, config['num_ls'], change_times, config['total_mcmc_num']) + + # update now_max (更新历史最佳结果) + for i0 in range(config['total_mcmc_num']): + if temp_max[i0] > now_max_res[i0]: + now_max_res[i0] = temp_max[i0] + now_max_info[:, i0] = temp_max_info[:, i0] + + # update if min is too small (防止某个探索链的结果过差,用最优结果替换它) + now_max = max(now_max_res).item() + now_max_index = torch.argmax(now_max_res) + now_min_index = torch.argmin(now_max_res) + + now_max_res[now_min_index] = 
+    for epoch in range(config['max_epoch_num']):
+
+        if epoch % config['reset_epoch_num'] == 0:
+            net.reset_parameters()
+            regular = config['regular_init']
+
+        net.train()
+
+        # ================= Dynamic Model Invocation =================
+        # Depending on the model type, decide whether forward() receives the data object
+        if model_type == "transformer":
+            # The Transformer model needs the data object to build the attention mask
+            if epoch <= 0:
+                retdict = net(data, regular, None, None, device=device)
+            else:
+                retdict = net(data, regular, start_samples, value, device=device)
+        else:  # simple model
+            if epoch <= 0:
+                retdict = net(regular, None, None, device=device)
+            else:
+                retdict = net(regular, start_samples, value, device=device)
+
+        optimizer.zero_grad()  # clear gradients left over from the previous step
+        retdict["loss"][0].backward()
+        torch.nn.utils.clip_grad_norm_(net.parameters(), 1)
+        optimizer.step()
+
+        # get start samples
+        if epoch == 0:
+            probs = (torch.zeros(nvar)+0.5).to(device)
+            tensor_probs = sample_initializer(
+                config["problem_type"], probs, config, data=data)
+            temp_max, temp_max_info, temp_start_samples, value = sampler(
+                data, tensor_probs, probs, config['num_ls'], 0, config['total_mcmc_num'])
+
+            # On the first round, adopt the results as the current best directly
+            now_max_res = temp_max
+            now_max_info = temp_max_info
+
+            tensor_probs = temp_max_info.clone()
+            tensor_probs = tensor_probs.repeat(1, config['repeat_times'])
+            start_samples = temp_start_samples.t().to(device)
+
+        # get subsequent samples
+        if epoch % config['sample_epoch_num'] == 0 and epoch > 0:
+            probs = retdict["output"][0].detach()  # detach() keeps gradients from flowing back
+
+            # Sample and evaluate using the probabilities produced by the model
+            temp_max, temp_max_info, start_samples_temp, value = sampler(
+                data, tensor_probs, probs, config['num_ls'], change_times, config['total_mcmc_num'])
+
+            # update now_max (the best results seen so far)
+            for i0 in range(config['total_mcmc_num']):
+                if temp_max[i0] > now_max_res[i0]:
+                    now_max_res[i0] = temp_max[i0]
+                    now_max_info[:, i0] = temp_max_info[:, i0]
+
+            # update if min is too small (replace the worst exploration chain with the best one)
+            now_max = max(now_max_res).item()
+            now_max_index = torch.argmax(now_max_res)
+            now_min_index = torch.argmin(now_max_res)
+
+            now_max_res[now_min_index] = now_max
+            now_max_info[:, now_min_index] = now_max_info[:, now_max_index].clone()  # clone() avoids aliasing
+            temp_max_info[:, now_min_index] = now_max_info[:, now_max_index].clone()
+
+            # select best samples (use the best samples of this round as the next MCMC starting points)
+            tensor_probs = temp_max_info.clone()
+            tensor_probs = tensor_probs.repeat(1, config['repeat_times'])
+            # construct the start point for the next model iteration
+            start_samples = start_samples_temp.t()
+
+        if verbose:
+            if config["problem_type"] == "maxsat" and len(data.pdata) == 7:
+                res = max(now_max_res).item()
+                if res > data.pdata[5] * data.pdata[6]:
+                    res -= data.pdata[5] * data.pdata[6]
+                print("o {:.3f}".format(res))
+            elif "obj_type" in config and config["obj_type"] == "neg":
+                print("o {:.3f}".format((-max(now_max_res).item())))
+            else:
+                print("o {:.3f}".format((max(now_max_res).item())))
+
+        # free GPU memory
+        del (retdict)
+
+    total_max = now_max_res
+    best_sort = torch.argsort(now_max_res, descending=True)
+    total_best_info = torch.squeeze(now_max_info[:, best_sort[0]])
+
+    return max(total_max).item(), total_best_info, now_max_res, now_max_info, net
\ No newline at end of file
diff --git a/src/tutorials/MCPG/model.py b/src/tutorials/MCPG/model.py
new file mode 100644
index 0000000..e296a1b
--- /dev/null
+++ b/src/tutorials/MCPG/model.py
@@ -0,0 +1,178 @@
+# --- model.py ---
+"""
+Copyright (c) 2024 Cheng Chen, Ruitao Chen, Tianyou Li, Zaiwen Wen
+All rights reserved.
+... (copyright text remains the same) ...
+"""
+import torch
+import torch.nn as nn
+import math
+import torch.nn.functional as F
+
+
+# ==============================================================================
+# The original simple model (unchanged)
+# ==============================================================================
+class simple(torch.nn.Module):
+    def __init__(self, output_num):
+        super(simple, self).__init__()
+        self.lin = torch.nn.Linear(1, output_num)
+        self.sigmoid = torch.nn.Sigmoid()
+
+    def reset_parameters(self):
+        self.lin.reset_parameters()
+
+    def forward(self, alpha=0.1, start_samples=None, value=None,
+                device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
+        # 1. Produce the probabilities: a "blind" model that ignores the problem structure
+        x = torch.ones(1).to(device)
+        x = self.lin(x)
+        x = self.sigmoid(x)
+
+        x = (x-0.5) * 0.6 + 0.5
+        probs = x.squeeze()
+
+        retdict = {}
+        reg = probs * torch.log(probs) + (1-probs) * torch.log(1-probs)
+        reg = torch.mean(reg)
+
+        # On the first call, return only the probabilities and the regularizer
+        if start_samples is None:
+            retdict["output"] = [probs.squeeze(-1), "hist"]
+            retdict["reg"] = [reg, "sequence"]
+            retdict["loss"] = [alpha * reg, "sequence"]
+            return retdict
+
+        # 2. Compute the loss from the feedback returned by the solver
+        res_samples = value.t().detach()
+
+        start_samples_idx = start_samples * \
+            probs + (1 - start_samples) * (1 - probs)
+        log_start_samples_idx = torch.log(start_samples_idx)
+        log_start_samples = log_start_samples_idx.sum(dim=1)
+        loss_ls = torch.mean(log_start_samples * res_samples)
+        loss = loss_ls + alpha * reg
+
+        # 3. Return a dictionary carrying all of the above
+        retdict["output"] = [probs.squeeze(-1), "hist"]
+        retdict["reg"] = [reg, "sequence"]
+        retdict["loss"] = [loss, "sequence"]
+        return retdict
+
+    def __repr__(self):
+        return self.__class__.__name__
+
+# ==============================================================================
+# The newly added Transformer model
+# ==============================================================================
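+# Expected configuration keys when model_type is "transformer"
+# (read by mcpg_solver.py; the numeric values below are illustrative, not tuned):
+#   model_type: transformer
+#   transformer_params:
+#     embedding_dim: 64
+#     num_heads: 4
+#     num_layers: 2
+#     dim_feedforward: 128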
+class TransformerModel(nn.Module):
+    def __init__(self, nvar, embedding_dim, num_heads, num_layers, dim_feedforward, dropout=0.1):
+        super(TransformerModel, self).__init__()
+        self.nvar = nvar
+
+        # 1. Node embedding layer: one learnable vector per variable
+        self.embedding = nn.Embedding(nvar, embedding_dim)
+
+        # 2. Transformer encoder layers: the core of the model.
+        # A TransformerEncoder is used because all nodes are processed in
+        # parallel rather than generated autoregressively.
+        encoder_layer = nn.TransformerEncoderLayer(
+            d_model=embedding_dim,
+            nhead=num_heads,
+            dim_feedforward=dim_feedforward,
+            dropout=dropout,
+            batch_first=True  # use the (batch, seq, feature) layout
+        )
+        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
+
+        # 3. Output layer: map the Transformer output back to one probability per variable
+        self.output_layer = nn.Linear(embedding_dim, 1)
+        self.sigmoid = nn.Sigmoid()
+
+    def reset_parameters(self):
+        self.embedding.reset_parameters()
+        self.output_layer.reset_parameters()
+        # Reset the Transformer encoder layers
+        for layer in self.transformer_encoder.layers:
+            layer.self_attn.in_proj_weight.data.normal_(0, 0.02)
+            layer.self_attn.out_proj.weight.data.normal_(0, 0.02)
+            layer.linear1.weight.data.normal_(0, 0.02)
+            layer.linear2.weight.data.normal_(0, 0.02)
+
+    def _create_attention_mask(self, data, device):
+        # Build an attention mask from the graph adjacency:
+        # 0.0 means attention is allowed, -inf means it is forbidden
+        adj = torch.full((self.nvar, self.nvar), float('-inf'), device=device)
+
+        # Allow each node to attend to itself
+        adj.fill_diagonal_(0.0)
+
+        # Allow each node to attend to its neighbors
+        edge_index = data.edge_index
+        adj[edge_index[0], edge_index[1]] = 0.0
+        adj[edge_index[1], edge_index[0]] = 0.0  # undirected graph
+
+        return adj
+
+    def forward(self, data, alpha=0.1, start_samples=None, value=None,
+                device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
+
+        # ================= Part 1: produce the probabilities (the upgraded "brain") =================
+        # 1.1 Fetch the initial embeddings of all nodes
+        node_indices = torch.arange(self.nvar, device=device)
+        x = self.embedding(node_indices).unsqueeze(0)  # add a batch dimension
+
+        # 1.2 Build the attention mask that injects the graph structure into the model.
+        # A mask is only built for graph problems such as Max-Cut.
+        attn_mask = None
+        if hasattr(data, 'edge_index'):
+            attn_mask = self._create_attention_mask(data, device)
+
+        # 1.3 Exchange information between nodes through the Transformer
+        transformer_output = self.transformer_encoder(x, mask=attn_mask)
+
+        # 1.4 Produce the final probabilities
+        logits = self.output_layer(transformer_output).squeeze(0).squeeze(-1)
+        probs = self.sigmoid(logits)
+
+        # ================= Part 2: compute the loss (stable version + shape fix) =================
+        retdict = {}
+
+        # Prepare the probabilities and the entropy regularizer (numerically stable form)
+        p = torch.sigmoid(logits)  # [nvar]
+        log_p = -F.softplus(-logits)
+        log_q = -F.softplus(logits)
+        entropy = -(p * log_p + (1 - p) * log_q)  # standard Bernoulli entropy (stable)
+        reg = -entropy.mean()  # consistent with the original definition of reg
+
+        if start_samples is None:
+            retdict["output"] = [p, "hist"]
+            retdict["reg"] = [reg, "sequence"]
+            retdict["loss"] = [alpha * reg, "sequence"]
+            return retdict
+
+        # ---- Key fix: broadcast the logits to [B, nvar] to match start_samples ----
+        B = start_samples.shape[0]  # start_samples is [B, nvar]
+        logits_batched = logits.unsqueeze(0).expand(B, -1)  # [B, nvar]
+
+        # Use BCEWithLogits to get per-node log-probs (stable; never evaluates log(0))
+        log_p_node = -F.binary_cross_entropy_with_logits(
+            logits_batched, start_samples.float(), reduction='none'
+        )  # [B, nvar]
+
+        log_start_samples = log_p_node.sum(dim=1)  # [B]
+        log_start_samples = torch.nan_to_num(log_start_samples, neginf=-1e9, posinf=1e9)
+
+        res_samples = value.t().detach()  # [B]
+        res_samples = torch.nan_to_num(res_samples, nan=0.0, neginf=0.0, posinf=0.0)
+
+        loss_ls = torch.mean(log_start_samples * res_samples)
+        loss = loss_ls + alpha * reg
+
+        retdict["output"] = [p, "hist"]  # still the per-node probabilities, shape [nvar]
+        retdict["reg"] = [reg, "sequence"]
+        retdict["loss"] = [loss, "sequence"]
+        return retdict
+
+
+    def __repr__(self):
+        return self.__class__.__name__
\ No newline at end of file
diff --git a/src/tutorials/MCPG/sampling.py b/src/tutorials/MCPG/sampling.py
new file mode 100644
index 0000000..7c7f30f
--- /dev/null
+++ b/src/tutorials/MCPG/sampling.py
@@ -0,0 +1,600 @@
+import torch
+from torch_scatter import scatter
+from torch.distributions.bernoulli import Bernoulli
+
+
+def sample_initializer(problem_type, probs, config,
+                       device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'), data=None):
+    if problem_type in ["r_cheegercut", "n_cheegercut"]:
+        samples = torch.zeros(config['total_mcmc_num'], data.num_nodes)
+        index = data.sorted_degree_nodes[- config['total_mcmc_num']:]
+        for i in range(config['total_mcmc_num']):
+            samples[i][index[i]] = 1
+        samples = samples.repeat(config['repeat_times'], 1)
+        return samples.t()
+    m = Bernoulli(probs)
+    samples = m.sample([config['total_mcmc_num'] * config['repeat_times']])
+
+    """ if problem_type == "ising":
+        samples = samples * 2 - 1"""
+
+    samples = samples.detach().to(device)
+    return samples.t()
+
+
+def sampler_select(problem_type):
+    if problem_type == "maxcut":
+        return mcpg_sampling_maxcut
+    elif problem_type == "maxcut_edge":
+        return mcpg_sampling_maxcut_edge
+    elif problem_type == "maxsat":
+        return mcpg_sampling_maxsat
+    elif problem_type == "mimo":
+        return mcpg_sampling_mimo
+    elif problem_type == "qubo":
+        return mcpg_sampling_qubo
+    elif problem_type == "qubo_bin":
+        return mcpg_sampling_qubo_bin
+    elif problem_type == "r_cheegercut":
+        return mcpg_sampling_rcheegercut
+    elif problem_type == "n_cheegercut":
+        return mcpg_sampling_ncheegercut
+    elif problem_type == "ising":
+        return mcpg_sampling_ising
+    else:
+        raise (Exception("Unrecognized problem type {}".format(problem_type)))
+
+
+def metro_sampling(probs, start_status, max_transfer_time,
+                   device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
+    num_node = len(probs)
+    num_chain = start_status.shape[1]
+    index_col = torch.tensor(list(range(num_chain))).to(device)
+
+    probs = probs.detach().to(device)
+    samples = start_status.bool().to(device)
+
+    count = 0
+    for t in range(max_transfer_time * 5):
+        if count >= num_chain*max_transfer_time:
+            break
+        index_row = torch.randint(low=0, high=num_node, size=[
+                                  num_chain], device=device)
+        chosen_probs_base = probs[index_row]
+        chosen_value = samples[index_row, index_col]
+        chosen_probs = torch.where(
+            chosen_value, chosen_probs_base, 1-chosen_probs_base)
+        accept_rate = (1 - chosen_probs) / chosen_probs
+        r = torch.rand(num_chain, device=device)
+        is_accept = (r < accept_rate)
+        samples[index_row, index_col] = torch.where(
+            is_accept, ~chosen_value, chosen_value)
+
+        count += is_accept.sum()
+
+    return samples.float().to(device)
+
+
+def mcpg_sampling_ising(data,
+                        start_result, probs,
+                        num_ls, change_times, total_mcmc_num,
+                        device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
+    probs = probs.to(torch.device("cpu"))
+    num_edges = data.num_edges
+    edges = data.edge_index
+    nlr_graph = edges[0]
+    nlc_graph = edges[1]
+    edge_weight = data.edge_attr
+    edge_weight_sum = data.edge_weight_sum
+    graph_probs = start_result.clone()
+    # get probs
+    graph_probs = metro_sampling(
+        probs, graph_probs, change_times, device)
+    start = graph_probs.clone()
+
+    temp = graph_probs[data.sorted_degree_nodes[0]].clone()
+    graph_probs += temp
+    graph_probs = graph_probs % 2
+
+    graph_probs = (graph_probs - 0.5) * 2 + 0.5
+
+    # local search
+    temp = torch.zeros(4, graph_probs.size(dim=1)).to(device)
+    expected_cut = torch.zeros(graph_probs.size(dim=1))
+    cnt = 0
+    while True:
+        cnt += 
1 + for i in range(num_edges): + index = data.sorted_degree_edges[i] + node_r = nlr_graph[index] + node_c = nlc_graph[index] + edges_r = data.n0_edges[index] + edges_c = data.n1_edges[index] + add_0 = data.add[0][index] + add_1 = data.add[1][index] + add_2 = data.add[2][index] + + temp_r_v = torch.mm(edges_r, graph_probs[data.n0[index]]) + temp_c_v = torch.mm(edges_c, graph_probs[data.n1[index]]) + + temp[1] = temp_r_v + torch.rand(graph_probs.size(dim=1), device=device) * 0.1 + add_0 + temp[2] = temp_c_v + torch.rand(graph_probs.size(dim=1), device=device) * 0.1 + add_1 + temp[0] = temp[1] + temp[2] + torch.rand(graph_probs.size(dim=1), + device=torch.device(device)) * 0.1 - add_2 + + max_index = torch.argmax(temp, dim=0) + graph_probs[node_r] = torch.floor(max_index / 2) + graph_probs[node_c] = max_index % 2 + + if cnt >= num_ls: + break + + # — Ising energy instead of cut — + # spins ∈ {+1,–1} + spins = 2 * graph_probs - 1 # [nvar, chains] + + # raw energy per chain: E(s) = - sum J_ij s_i s_j + energy_vec = (edge_weight * (spins[nlr_graph] * spins[nlc_graph])).sum(dim=0) # [chains] + + # full‐chain advantage for policy gradient + adv = (energy_vec - energy_vec.mean()).to(device) # [chains] + + # pick the best chain out of each repeat group + energy_reshape = energy_vec.view(-1, total_mcmc_num) # [repeat_times, total_mcmc_num] + idx = torch.argmin(energy_reshape, dim=0) # [total_mcmc_num] + for i0 in range(total_mcmc_num): + idx[i0] = i0 + idx[i0] * total_mcmc_num + temp_max = energy_vec[idx] # [total_mcmc_num] + temp_max_info = graph_probs[:, idx] # [nvar, total_mcmc_num] + start_samps = start # [nvar, chains] + + return temp_max, temp_max_info, start_samps, adv + + +'''def mcpg_sampling_ising(data, tensor_probs, probs, num_ls, change_times, total_mcmc_num): + """ + Metropolis-based sampler for Ising energy: + E(s) = - sum_{(i,j)} J_ij * s_i * s_j + + Returns: + best_E : Tensor [num_chains] of best (lowest) energies found + best_config : Tensor [nvar, num_chains] of corresponding spin configs + start_samples: Tensor [num_chains, nvar] final samples to seed next epoch + final_E : Tensor [num_chains] of energies of the final samples + """ + # device & initial samples: tensor_probs is [nvar, num_chains] + device = data.edge_index.device + samples = tensor_probs.t().clone() # [num_chains, nvar] + num_chains, nvar = samples.shape + + # edges & weights + u, v = data.edge_index # each [num_edges] + w = getattr(data, "edge_weight", None) or data.edge_attr # [num_edges] + + # track best so far + best_E = torch.full((num_chains,), float('inf'), device=device) + best_config = samples.clone() # [num_chains, nvar] + + for _ in range(change_times): + # --- compute current energy per chain --- + x_u = samples[:, u] # [num_chains, num_edges] + x_v = samples[:, v] + E_before = -(w * (x_u * x_v)).sum(dim=1) # [num_chains] + + # --- propose one random flip per chain --- + flips = torch.randint(0, nvar, (num_chains,), device=device) + idx = torch.arange(num_chains, device=device) + + # build proposed configurations + samples_prop = samples.clone() + samples_prop[idx, flips] *= -1 + + # --- compute proposed energy --- + x_u_prop = samples_prop[:, u] + x_v_prop = samples_prop[:, v] + E_after = -(w * (x_u_prop * x_v_prop)).sum(dim=1) + + # --- Metropolis accept/reject --- + delta = E_after - E_before + prob_accept = torch.exp(-delta) + accept = (E_after <= E_before) | (torch.rand(num_chains, device=device) < prob_accept) + acc_idx = torch.nonzero(accept, as_tuple=True)[0] + samples[acc_idx, flips[acc_idx]] *= 
-1 + + # --- update best if improved --- + better = E_after < best_E + best_E[better] = E_after[better] + best_config[better] = samples[better] + + # (optional) you could insert your local‐search num_ls loops here + + # final energies of each chain + x_u_final = samples[:, u] + x_v_final = samples[:, v] + print(w.shape) + print(x_u_final.shape) + print(x_v_final.shape) + final_E = -(w * (x_u_final * x_v_final)).sum(dim=1) + + # return in the order expected by mcpg_solver: + # temp_max (best_E), temp_max_info (best_config transposed), + # start_samples (samples), value (final_E) + return ( + best_E, # [num_chains] + best_config.t().contiguous(), # [nvar, num_chains] + samples, # [num_chains, nvar] + final_E # [num_chains] + ) + +''' + +def mcpg_sampling_maxcut(data, + start_result, probs, + num_ls, change_times, total_mcmc_num, + device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')): + probs = probs.to(torch.device("cpu")) + num_nodes = data.num_nodes + edges = data.edge_index + nlr_graph = edges[0] + nlc_graph = edges[1] + edge_weight = data.edge_attr + edge_weight_sum = data.edge_weight_sum + graph_probs = start_result.clone() + # get probs + graph_probs = metro_sampling( + probs, graph_probs, change_times) + start = graph_probs.clone() + + temp = graph_probs[data.sorted_degree_nodes[0]].clone() + graph_probs += temp + graph_probs = graph_probs % 2 + + graph_probs = (graph_probs-0.5)*2+0.5 + + # local search + expected_cut = torch.zeros(graph_probs.size(dim=1)) + cnt = 0 + while True: + cnt += 1 + for node_index in range(0, num_nodes): + node = data.sorted_degree_nodes[node_index] + neighbor_index = data.neighbors[node] + neighbor_edge_weight = data.neighbor_edges[node] + node_temp_v = torch.mm( + neighbor_edge_weight, graph_probs[neighbor_index]) + node_temp_v = torch.squeeze(node_temp_v) + node_temp_v += torch.rand(node_temp_v.shape[0], + device=torch.device(device))/4 + graph_probs[node] = (node_temp_v < + data.weighted_degree[node]/2+0.125).int() + if cnt >= num_ls: + break + + # maxcut + + expected_cut[:] = ((2 * graph_probs[nlr_graph.type(torch.long)][:] - 1)*( + 2 * graph_probs[nlc_graph.type(torch.long)][:] - 1) * edge_weight).sum(dim=0) + + expected_cut_reshape = torch.reshape(expected_cut, (-1, total_mcmc_num)) + index = torch.argmin(expected_cut_reshape, dim=0) + for i0 in range(total_mcmc_num): + index[i0] = i0 + index[i0]*total_mcmc_num + max_cut = expected_cut[index] + return ((edge_weight_sum-max_cut) / 2), graph_probs[:, index], start, (expected_cut-torch.mean(expected_cut)).to(device) + +def mcpg_sampling_maxcut_edge(data, + start_result, probs, + num_ls, change_times, total_mcmc_num, + device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')): + probs = probs.to(torch.device("cpu")) + num_edges = data.num_edges + edges = data.edge_index + nlr_graph = edges[0] + nlc_graph = edges[1] + edge_weight = data.edge_attr + edge_weight_sum = data.edge_weight_sum + graph_probs = start_result.clone() + # get probs + graph_probs = metro_sampling( + probs, graph_probs, change_times, device) + start = graph_probs.clone() + + temp = graph_probs[data.sorted_degree_nodes[0]].clone() + graph_probs += temp + graph_probs = graph_probs%2 + + graph_probs = (graph_probs-0.5)*2+0.5 + + # local search + temp = torch.zeros(4,graph_probs.size(dim=1)).to(device) + expected_cut = torch.zeros(graph_probs.size(dim=1)) + cnt = 0 + while True: + cnt += 1 + for i in range(num_edges): + + index = data.sorted_degree_edges[i] + node_r = nlr_graph[index] + node_c = nlc_graph[index] + 
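+            # Score the four joint assignments of (node_r, node_c) using the
+            # cached neighbor weights, then keep the best of the four.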
edges_r = data.n0_edges[index]
+            edges_c = data.n1_edges[index]
+            add_0 = data.add[0][index]
+            add_1 = data.add[1][index]
+            add_2 = data.add[2][index]
+
+            temp_r_v = torch.mm(edges_r, graph_probs[data.n0[index]])
+            temp_c_v = torch.mm(edges_c, graph_probs[data.n1[index]])
+
+            # use the caller-supplied device instead of a hard-coded 'cuda:0',
+            # so this routine also runs on CPU-only machines
+            temp[1] = temp_r_v + torch.rand(graph_probs.size(dim=1), device=device)*0.1 + add_0
+            temp[2] = temp_c_v + torch.rand(graph_probs.size(dim=1), device=device)*0.1 + add_1
+            temp[0] = temp[1] + temp[2] + torch.rand(graph_probs.size(dim=1), device=device)*0.1 - add_2
+
+            max_index = torch.argmax(temp, dim=0)
+            graph_probs[node_r] = torch.floor(max_index/2)
+            graph_probs[node_c] = max_index % 2
+
+        if cnt >= num_ls:
+            break
+
+    # maxcut
+    expected_cut[:] = ((2 * graph_probs[nlr_graph.type(torch.long)][:] - 1)*(
+        2 * graph_probs[nlc_graph.type(torch.long)][:] - 1) * edge_weight).sum(dim=0)
+
+    expected_cut_reshape = torch.reshape(expected_cut, (-1, total_mcmc_num))
+    index = torch.argmin(expected_cut_reshape, dim=0)
+    for i0 in range(total_mcmc_num):
+        index[i0] = i0 + index[i0]*total_mcmc_num
+    max_cut = expected_cut[index]
+
+    return ((edge_weight_sum-max_cut) / 2), graph_probs[:, index], start, (expected_cut-torch.mean(expected_cut)).to(device)
+
+
+def mcpg_sampling_rcheegercut(data,
+                              start_result, probs,
+                              num_ls, change_times, total_mcmc_num,
+                              device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
+    probs = probs.to(torch.device("cpu"))
+    nvar = data.num_nodes
+    edges = data.edge_index
+    nlr_graph, nlc_graph = edges
+    graph_probs = start_result.clone()
+    # get probs
+    raw_samples = metro_sampling(
+        probs, graph_probs, change_times)
+    samples = raw_samples.clone()
+
+    res_cut = ((2 * samples[nlr_graph.type(torch.long)][:] - 1)*(
+        2 * samples[nlc_graph.type(torch.long)][:] - 1)).sum(dim=0)
+    res_cut[:] = (data.edge_weight_sum - res_cut) / 2
+    res_node = samples.sum(dim=0)
+    cheeger_cut = res_cut / torch.min(res_node, nvar - res_node)
+
+    for cnt in range(num_ls):
+        for node_index in range(nvar):
+            node = data.sorted_degree_nodes[node_index]
+            neighbor_index = data.neighbors[node]
+
+            change_cut_size = torch.sum(samples[neighbor_index], dim=0)
+            new_res_cut = res_cut - \
+                (2 * samples[node] - 1) * \
+                (data.weighted_degree[node] - 2 * change_cut_size)
+            new_res_node = res_node - (2 * samples[node] - 1)
+            new_cheeger_cut = new_res_cut / \
+                torch.min(new_res_node, nvar - new_res_node)
+            new_min_node = torch.min(new_res_node, nvar - new_res_node)
+            cond = torch.logical_or(
+                (cheeger_cut < new_cheeger_cut), (new_min_node < 0.0000001))
+            samples[node] = torch.where(cond, samples[node], 1 - samples[node])
+            res_cut = torch.where(cond, res_cut, new_res_cut)
+            res_node = torch.where(cond, res_node, new_res_node)
+            cheeger_cut = torch.where(cond, cheeger_cut, new_cheeger_cut)
+    # maxcut
+    cheeger_cut_reshape = torch.reshape(cheeger_cut, (-1, total_mcmc_num))
+    index = torch.argmin(cheeger_cut_reshape, dim=0)
+    for i0 in range(total_mcmc_num):
+        index[i0] = i0 + index[i0]*total_mcmc_num
+    min_cheeger_cut = cheeger_cut[index]
+
+    return -min_cheeger_cut, samples[:, index], raw_samples, (cheeger_cut-torch.mean(cheeger_cut)).to(device)
+
+
+def mcpg_sampling_ncheegercut(data,
+                              start_result, probs,
+                              num_ls, change_times, total_mcmc_num,
+                              device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
+    probs = probs.to(torch.device("cpu"))
+    nvar = data.num_nodes
+    edges = data.edge_index
+    nlr_graph, nlc_graph = edges
+    graph_probs = start_result.clone()
+    # 
get probs + raw_samples = metro_sampling( + probs, graph_probs, change_times) + samples = raw_samples.clone() + + res_cut = ((2 * samples[nlr_graph.type(torch.long)][:] - 1)*( + 2 * samples[nlc_graph.type(torch.long)][:] - 1)).sum(dim=0) + res_cut[:] = (data.edge_weight_sum - res_cut) / 2 + res_node = samples.sum(dim=0) + cheeger_cut = res_cut * (1 / res_node + 1 / (nvar - res_node)) + + for cnt in range(num_ls): + for node_index in range(nvar): + node = data.sorted_degree_nodes[node_index] + neighbor_index = data.neighbors[node] + + change_cut_size = torch.sum(samples[neighbor_index], dim=0) + new_res_cut = res_cut - \ + (2 * samples[node] - 1) * \ + (data.weighted_degree[node] - 2 * change_cut_size) + new_res_node = res_node - (2 * samples[node] - 1) + new_cheeger_cut = new_res_cut * \ + (1 / new_res_node + 1 / (nvar - new_res_node)) + new_min_node = torch.min(new_res_node, nvar - new_res_node) + cond = torch.logical_or( + (cheeger_cut < new_cheeger_cut), (new_min_node < 0.0000001)) + samples[node] = torch.where(cond, samples[node], 1 - samples[node]) + res_cut = torch.where(cond, res_cut, new_res_cut) + res_node = torch.where(cond, res_node, new_res_node) + cheeger_cut = torch.where(cond, cheeger_cut, new_cheeger_cut) + # maxcut + cheeger_cut_reshape = torch.reshape(cheeger_cut, (-1, total_mcmc_num)) + index = torch.argmin(cheeger_cut_reshape, dim=0) + for i0 in range(total_mcmc_num): + index[i0] = i0 + index[i0]*total_mcmc_num + min_cheeger_cut = cheeger_cut[index] + + return -min_cheeger_cut, samples[:, index], raw_samples, (cheeger_cut-torch.mean(cheeger_cut)).to(device) + + +def mcpg_sampling_maxsat( + data, + start_result, probs, + num_ls, change_times, total_mcmc_num, + device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')): + + nvar, nclause, vi, ci, neg = data.pdata[0:5] + raw_samples = start_result.clone() + raw_samples = metro_sampling( + probs, raw_samples, change_times, device) + samples = raw_samples.t().clone() + samples = samples * 2 - 1 + for cnt in range(num_ls): + for index in range(nvar): + i = data.ndata[3][index].item() + cal_sample = samples[:, data.ndata[0][i]] * data.ndata[2][i] + res_clause = scatter( + cal_sample, data.ndata[1][i], reduce="max", dim=1) + res_sample_old = torch.sum(res_clause, dim=1) + samples[:, i] = - samples[:, i] + cal_sample = samples[:, data.ndata[0][i]] * data.ndata[2][i] + res_clause = scatter( + cal_sample, data.ndata[1][i], reduce="max", dim=1) + res_sample_new = torch.sum(res_clause, dim=1) + # ind = (res_sample_new > res_sample_old) + ind = (res_sample_new > res_sample_old + + torch.rand(res_sample_old.shape[0], device=device)-0.5) + samples[:, i] = torch.where(ind, samples[:, i], -samples[:, i]) + + cal_sample = samples[:, vi] * neg + res_clause = scatter(cal_sample, ci, reduce="max", dim=1) + res_sample = torch.sum(res_clause, dim=1) + if len(data.pdata) == 7: + res_sample += nclause-data.pdata[6]+data.pdata[5]*data.pdata[6] + else: + res_sample += nclause + res_sample = res_sample/2 + + res_sample_reshape = torch.reshape(res_sample, (-1, total_mcmc_num)) + index = torch.argmax(res_sample_reshape, dim=0) + index = torch.tensor(list(range(total_mcmc_num)), + device=device) + index*total_mcmc_num + max_res = res_sample[index] + samples = (samples + 1) / 2 + return max_res, samples.t()[:, index], raw_samples, -(res_sample - torch.mean(res_sample.float())).to(device) + + +def mcpg_sampling_mimo(data, + start_result, probs, + num_ls, change_times, total_mcmc_num, + device=torch.device('cuda' if torch.cuda.is_available() 
else 'cpu')): + Sigma = data[0] + Diag = data[1] + num_n = data[0].shape[0] + info = start_result.clone() + # get probs + info = metro_sampling( + probs, info, change_times, device) + start = info.clone() + + info = (info-0.5)*4 # convert to 2, -2 + + # local search + cnt = 0 + while True: + for node in range(0, num_n): + if cnt >= num_ls*num_n: + break + cnt += 1 + neighbor_weight = Sigma[node].unsqueeze(0) + node_temp_v = torch.matmul( + neighbor_weight, info) + node_temp_v = torch.squeeze(node_temp_v) + temp = (node_temp_v < - Diag[node]/2).int() + info[node] = temp*2-1 + if cnt >= num_ls*num_n: + break + + # compute value + expected = (info * torch.matmul(Sigma, info)).sum(dim=0) + expected += torch.matmul(Diag.unsqueeze(0), info).sum(dim=0) + expected_cut_reshape = torch.reshape(expected, (-1, total_mcmc_num)) + index = torch.argmin(expected_cut_reshape, dim=0) + for i0 in range(total_mcmc_num): + index[i0] = i0 + index[i0]*total_mcmc_num + min_res = expected[index] + info = (info+1)/2 + return -(min_res+data[3]), info[:, index], start, (expected-torch.mean(expected)).to(device) + + +def mcpg_sampling_qubo(data, start_result, probs, num_ls, change_times, total_mcmc_num, + device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')): + Q = data['Q'] + nvar = data['nvar'] + raw_samples = start_result.clone() + # get probs + raw_samples = metro_sampling( + probs, raw_samples, change_times, device) + samples = raw_samples.clone() + samples = samples * 2 - 1 + # local search + for cnt in range(num_ls): + for index in range(nvar): + samples[index] = 0 + res = torch.matmul(Q[index], samples) + ind = (res > 0) + samples[index] = 2 * ind - 1 + # compute value + res_sample = torch.matmul(Q, samples) + res_sample = torch.sum(torch.mul(samples, res_sample), dim=0) + res_sample_reshape = torch.reshape(res_sample, (-1, total_mcmc_num)) + index = torch.argmax(res_sample_reshape, dim=0) + index = torch.tensor(list(range(total_mcmc_num)), + device=device) + index*total_mcmc_num + max_res = res_sample[index] + samples = (samples + 1) / 2 + return max_res, samples[:, index], raw_samples, -(res_sample - torch.mean(res_sample.float())).to(device) +def mcpg_sampling_qubo_bin(data, start_result, probs, num_ls, change_times, total_mcmc_num, + device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')): + Q = data['Q'] + nvar = data['nvar'] + raw_samples = start_result.clone() + # get probs + raw_samples = metro_sampling( + probs, raw_samples, change_times, device) + samples = raw_samples.clone() + # local search + for cnt in range(num_ls): + for index in range(nvar): + samples[index] = 0 + res = torch.matmul(Q[index], samples) + ind = (res > (- Q[index][index]/2)) + samples[index] = ind + # compute value + res_sample = torch.matmul(Q, samples) + res_sample = torch.sum(torch.mul(samples, res_sample), dim=0) + res_sample_reshape = torch.reshape(res_sample, (-1, total_mcmc_num)) + index = torch.argmax(res_sample_reshape, dim=0) + index = torch.tensor(list(range(total_mcmc_num)), + device=device) + index*total_mcmc_num + max_res = res_sample[index] + return max_res, samples[:, index], raw_samples, -(res_sample - torch.mean(res_sample.float())).to(device) diff --git a/src/tutorials/MCPG/simulation.py b/src/tutorials/MCPG/simulation.py new file mode 100644 index 0000000..de6e41e --- /dev/null +++ b/src/tutorials/MCPG/simulation.py @@ -0,0 +1,133 @@ +""" +Copyright (c) 2024 Cheng Chen, Ruitao Chen, Tianyou Li, Zaiwen Wen +All rights reserved. 
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that
+the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
+   following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
+   and the following disclaimer in the documentation and/or other materials provided with the distribution.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+import torch
+import time
+import yaml
+import argparse
+from dataloader import read_data_mimo
+from mcpg_solver import mcpg_solver
+
+H_num = 10
+X_num = 10
+num_rand = H_num * X_num
+
+parser = argparse.ArgumentParser()
+parser.add_argument("snr", type=int,
+                    help="input the SNR for the simulation.")
+parser.add_argument("size", type=int,
+                    help="input the problem size, which selects the dataset.")
+args = parser.parse_args()
+with open("../config/mimo_default.yaml") as f:
+    config = yaml.safe_load(f)
+
+def get_parameter(SNR, N, K, config):
+    config["num_ls"] = 4
+    num_epochs = 3
+    max_range = 150
+    if N == 800 and K == 800:
+        if SNR == 2:
+            config["num_ls"] = 3
+            num_epochs = 1
+        elif SNR == 4:
+            config["num_ls"] = 3
+            num_epochs = 2
+        elif SNR == 6:
+            config["num_ls"] = 5
+            num_epochs = 5
+        elif SNR == 8:
+            config["num_ls"] = 7
+            num_epochs = 3
+        elif SNR == 10:
+            config["num_ls"] = 5
+            num_epochs = 2
+        elif SNR == 12:
+            config["num_ls"] = 4
+            num_epochs = 2
+    if N == 1000 and K == 1000:
+        if SNR == 2:
+            config["num_ls"] = 2
+            num_epochs = 2
+        elif SNR == 4:
+            config["num_ls"] = 3
+            num_epochs = 2
+        elif SNR == 6:
+            config["num_ls"] = 7
+            num_epochs = 4
+        elif SNR == 8:
+            config["num_ls"] = 8
+            num_epochs = 4
+        elif SNR == 10:
+            config["num_ls"] = 7
+            num_epochs = 2
+        elif SNR == 12:
+            config["num_ls"] = 3
+            num_epochs = 3
+    if N == 1200 and K == 1200:
+        if SNR == 2:
+            config["num_ls"] = 3
+            num_epochs = 1
+        elif SNR == 4:
+            config["num_ls"] = 3
+            num_epochs = 2
+        elif SNR == 6:
+            config["num_ls"] = 7
+            num_epochs = 3
+        elif SNR == 8:
+            config["num_ls"] = 6
+            num_epochs = 4
+            max_range = 100
+        elif SNR == 10:
+            config["num_ls"] = 4
+            num_epochs = 4
+        elif SNR == 12:
+            config["num_ls"] = 5
+            num_epochs = 2
+    config["max_epoch_num"] = num_epochs * config["sample_epoch_num"]
+    return config, max_range
+
+num_nodes = 2*args.size
+max_range = 150
+config["num_ls"] = 4  # local search epochs
+config, max_range = get_parameter(args.snr, args.size, args.size, config)  # get parameters
+
+rand_seeds = list(range(0, num_rand))
+
+record = []
+total_time = 0
+
+for r_seed in rand_seeds:
+
+    data = read_data_mimo(args.size, args.size, args.snr, X_num, r_seed)
+
+    total_start_time = time.perf_counter()
+    # mcpg_solver also returns the trained net as a fifth value; it is unused here
+    _, _, now_best, now_best_info, _ = mcpg_solver(num_nodes, config, data)
+    
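+    # Map the {0,1} samples back to {-1,+1} symbols before comparing
+    # against the ground-truth transmit vector stored in data[2].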
now_best_info = now_best_info * 2 - 1 + total_end_time = time.perf_counter() + total_time += total_end_time - total_start_time + # get average information + best_sort = torch.argsort(now_best, descending=True) + total_best_info = torch.squeeze(now_best_info[:, best_sort[0]]) + for i0 in range(max_range): + total_best_info += torch.squeeze(now_best_info[:, best_sort[i0]]) + total_best_info = torch.sign(total_best_info) + record.append((total_best_info != data[2]).sum().item()/num_nodes) + +print("SNR = {} SIZE = {}".format(args.snr, args.size)) +print("BEST: ", min(record)) +print("MEAN: ", sum(record)/num_rand) +print("AVERAGE TIME: ", total_time/num_rand) diff --git a/src/tutorials/eco_dqn.ipynb b/src/tutorials/eco_dqn.ipynb new file mode 100644 index 0000000..685a650 --- /dev/null +++ b/src/tutorials/eco_dqn.ipynb @@ -0,0 +1,33 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "ddcb1a41-b9f2-4c5c-bfeb-cd102d2e97c3", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (maxcut)", + "language": "python", + "name": "maxcut" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}