Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 20 additions & 6 deletions minda/decompose.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pandas as pd
import numpy as np
import gzip
import re
from collections import Counter
from pybedtools import BedTool

Expand Down Expand Up @@ -170,6 +171,13 @@ def _get_alt_mate_index(df):
return df


def _replace_bnd_mate_locus(alt, mate_chrom, mate_pos):
alt = str(alt)
pattern = r'([][])([^:\[\]]+):(\d+)([][])'
replacement = rf'\1{mate_chrom}:{mate_pos}\4'
return re.sub(pattern, replacement, alt, count=1)


def _get_paired_alt_dfs(alt_df):

# check if BNDS are a single record or two
Expand All @@ -182,6 +190,12 @@ def _get_paired_alt_dfs(alt_df):
alt_df_2 = alt_df.copy()
alt_df_2['#CHROM'] = alt_df_2.ALT.str.extract(r'(chr\w+|\w+):')[0].to_list()
alt_df_2['POS'] = alt_df_2.ALT.str.extract(r':(\d+)')[0].astype(pd.Int64Dtype()).to_list()
# Build reciprocal ALT for the synthetic mate-side row so each side
# points to the opposite breakpoint locus.
alt_df_2['ALT'] = [
_replace_bnd_mate_locus(alt, chrom, pos)
for alt, chrom, pos in zip(alt_df_2['ALT'], alt_df_1['#CHROM'], alt_df_1['POS'])
]
paired_alt_dfs = [alt_df_1, alt_df_2]
logger.debug(f"(1) Number of alt/alt_1/alt_2 records: {alt_df.shape[0]} {alt_df_1.shape[0]} {alt_df_2.shape[0]}")
logger.info(f"Number of paired records paired by ALT column: {alt_df_1.shape[0]} {alt_df_2.shape[0]}")
Expand Down Expand Up @@ -272,6 +286,7 @@ def _get_paired_info_dfs(info_df):


def _check_df_order(df_1, df_2):
payload_columns = df_1.columns

# row by row for start and end df, check that the order by sorting
for i in range(len(df_1)):
Expand All @@ -287,12 +302,11 @@ def _check_df_order(df_1, df_2):
order_df = pd.concat([row_1, row_2]).reset_index(drop=True)
sorted_order_df = _get_sorted_df(order_df)

# if the sort is out of order, swap the chrom & pos values of the start & end dfs
if order_df.equals(sorted_order_df) == False:
df_1.at[i,'#CHROM'] = sorted_order_df.iloc[0]['#CHROM']
df_1.at[i, 'POS'] = sorted_order_df.iloc[0]['POS']
df_2.at[i,'#CHROM'] = sorted_order_df.iloc[1]['#CHROM']
df_2.at[i, 'POS'] = sorted_order_df.iloc[1]['POS']
# If out of order, copy full payloads (not only CHROM/POS) so
# ALT/INFO/ID stay consistent with the selected breakpoint side.
if order_df.equals(sorted_order_df) == False:
df_1.loc[df_1.index[i], payload_columns] = sorted_order_df.loc[0, payload_columns].values
df_2.loc[df_2.index[i], payload_columns] = sorted_order_df.loc[1, payload_columns].values

return df_1, df_2

Expand Down