From a6a97ec95e326d8f79f980f3f53259c823fa6235 Mon Sep 17 00:00:00 2001 From: Kai Horny <63246519+sci-kai@users.noreply.github.com> Date: Fri, 8 May 2026 15:48:48 +0200 Subject: [PATCH 1/2] Fixed singleton BND alt fields important especially for ensemble VCF creation --- minda/decompose.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/minda/decompose.py b/minda/decompose.py index a1740f6..e9693a2 100644 --- a/minda/decompose.py +++ b/minda/decompose.py @@ -4,6 +4,7 @@ import pandas as pd import numpy as np import gzip +import re from collections import Counter from pybedtools import BedTool @@ -170,6 +171,13 @@ def _get_alt_mate_index(df): return df +def _replace_bnd_mate_locus(alt, mate_chrom, mate_pos): + alt = str(alt) + pattern = r'([][])([^:\[\]]+):(\d+)([][])' + replacement = rf'\1{mate_chrom}:{mate_pos}\4' + return re.sub(pattern, replacement, alt, count=1) + + def _get_paired_alt_dfs(alt_df): # check if BNDS are a single record or two @@ -182,6 +190,12 @@ def _get_paired_alt_dfs(alt_df): alt_df_2 = alt_df.copy() alt_df_2['#CHROM'] = alt_df_2.ALT.str.extract(r'(chr\w+|\w+):')[0].to_list() alt_df_2['POS'] = alt_df_2.ALT.str.extract(r':(\d+)')[0].astype(pd.Int64Dtype()).to_list() + # Build reciprocal ALT for the synthetic mate-side row so each side + # points to the opposite breakpoint locus. + alt_df_2['ALT'] = [ + _replace_bnd_mate_locus(alt, chrom, pos) + for alt, chrom, pos in zip(alt_df_2['ALT'], alt_df_1['#CHROM'], alt_df_1['POS']) + ] paired_alt_dfs = [alt_df_1, alt_df_2] logger.debug(f"(1) Number of alt/alt_1/alt_2 records: {alt_df.shape[0]} {alt_df_1.shape[0]} {alt_df_2.shape[0]}") logger.info(f"Number of paired records paired by ALT column: {alt_df_1.shape[0]} {alt_df_2.shape[0]}") From dc5d575de929babedd5b6bfe0e1eaf03f5c53477 Mon Sep 17 00:00:00 2001 From: Kai Horny <63246519+sci-kai@users.noreply.github.com> Date: Fri, 8 May 2026 15:49:47 +0200 Subject: [PATCH 2/2] Changed after ordering the breakpoint start/ends that all columns are switched, not just CHROM POS (including ALT field) --- minda/decompose.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/minda/decompose.py b/minda/decompose.py index e9693a2..eff6205 100644 --- a/minda/decompose.py +++ b/minda/decompose.py @@ -286,6 +286,7 @@ def _get_paired_info_dfs(info_df): def _check_df_order(df_1, df_2): + payload_columns = df_1.columns # row by row for start and end df, check that the order by sorting for i in range(len(df_1)): @@ -301,12 +302,11 @@ def _check_df_order(df_1, df_2): order_df = pd.concat([row_1, row_2]).reset_index(drop=True) sorted_order_df = _get_sorted_df(order_df) - # if sort is out of order, what the chrom & pos values of the start & end dfs - if order_df.equals(sorted_order_df) == False: - df_1.at[i,'#CHROM'] = sorted_order_df.iloc[0]['#CHROM'] - df_1.at[i, 'POS'] = sorted_order_df.iloc[0]['POS'] - df_2.at[i,'#CHROM'] = sorted_order_df.iloc[1]['#CHROM'] - df_2.at[i, 'POS'] = sorted_order_df.iloc[1]['POS'] + # If out of order, copy full payloads (not only CHROM/POS) so + # ALT/INFO/ID stay consistent with the selected breakpoint side. + if order_df.equals(sorted_order_df) == False: + df_1.loc[df_1.index[i], payload_columns] = sorted_order_df.loc[0, payload_columns].values + df_2.loc[df_2.index[i], payload_columns] = sorted_order_df.loc[1, payload_columns].values return df_1, df_2