Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
247 changes: 247 additions & 0 deletions deeptools/computeMatrixOperations.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ def parse_arguments():
or
computeMatrixOperations dataRange -h

or
computeMatrixOperationsExtended arithmetic -h

or
computeMatrixOperationsExtended mean -h

""",
epilog='example usages:\n'
'computeMatrixOperations subset -m input.mat.gz -o output.mat.gz --group "group 1" "group 2" --samples "sample 3" "sample 10"\n\n'
Expand Down Expand Up @@ -137,6 +143,22 @@ def parse_arguments():
help='Returns the min, max, median, 10th and 90th percentile of the matrix values per sample.',
usage='Example usage:\n computeMatrixOperations dataRange -m input.mat.gz\n\n')

# arithmetic
subparsers.add_parser(
'arithmetic',
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
parents=[arithmeticArgs()],
help='Perform arithmetic operations between two matrices with identical dimensions.',
usage='Example usage:\n computeMatrixOperationsExtended arithmetic -m input1.mat.gz input2.mat.gz -o output.mat.gz --operation ratio --pseudocount 1\n\n')

# mean
subparsers.add_parser(
'mean',
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
parents=[meanArgs()],
help='Calculate element-wise mean across two or more matrices with identical dimensions.',
usage='Example usage:\n computeMatrixOperationsExtended mean -m input1.mat.gz input2.mat.gz input3.mat.gz -o output.mat.gz\n\n')

parser.add_argument('--version', action='version',
version='%(prog)s {}'.format(version('deeptools')))

Expand Down Expand Up @@ -250,6 +272,59 @@ def filterValuesArgs():
return parser


def arithmeticArgs():
parser = argparse.ArgumentParser(add_help=False)
required = parser.add_argument_group('Required arguments')

required.add_argument('--matrixFile', '-m',
help='Two matrix files from the computeMatrix tool.',
nargs=2,
required=True)

required.add_argument('--outFileName', '-o',
help='Output file name',
required=True)

optional = parser.add_argument_group('Optional arguments')

optional.add_argument('--operation',
choices=['ratio', 'log2ratio', 'subtract', 'add', 'divide', 'multiply'],
default='ratio',
help='Arithmetic operation to perform. ratio=(data1+pseudocount)/(data2+pseudocount), '
'log2ratio=log2((data1+pseudocount)/(data2+pseudocount)), '
'subtract=data1-data2, add=data1+data2, divide=data1/data2, '
'multiply=data1*data2 (Default: %(default)s)')

optional.add_argument('--pseudocount',
type=float,
default=1.0,
help='Pseudocount to add to avoid division by zero. '
'Only used for ratio and log2ratio operations. '
'Set to 0 to disable. (Default: %(default)s)')

optional.add_argument('--no-pseudocount',
action='store_true',
help='Disable pseudocount (equivalent to --pseudocount 0)')

return parser


def meanArgs():
parser = argparse.ArgumentParser(add_help=False)
required = parser.add_argument_group('Required arguments')

required.add_argument('--matrixFile', '-m',
help='Two or more matrix files from the computeMatrix tool.',
nargs='+',
required=True)

required.add_argument('--outFileName', '-o',
help='Output file name',
required=True)

return parser


def sortArgs():
parser = argparse.ArgumentParser(add_help=False)
required = parser.add_argument_group('Required arguments')
Expand Down Expand Up @@ -688,6 +763,174 @@ def loadGTF(line, fp, fname, labels, regions, transcriptID, transcript_id_design
regions[labelIdx][name] = len(regions[labelIdx])


def arithmeticMatrices(args):
"""
Perform arithmetic operations between two matrices.
Both matrices must have identical dimensions.
"""
# Load both matrices
hm1 = heatmapper.heatmapper()
hm2 = heatmapper.heatmapper()

hm1.read_matrix_file(args.matrixFile[0])
hm2.read_matrix_file(args.matrixFile[1])

# Check that matrices have the same dimensions
if hm1.matrix.matrix.shape != hm2.matrix.matrix.shape:
sys.exit("Error: Matrices have different dimensions. "
"Matrix 1: {}, Matrix 2: {}\n".format(
hm1.matrix.matrix.shape, hm2.matrix.matrix.shape))

# Check that sample numbers and boundaries match
if len(hm1.matrix.sample_labels) != len(hm2.matrix.sample_labels):
sys.exit("Error: Matrices have different numbers of samples. "
"Matrix 1: {}, Matrix 2: {}\n".format(
len(hm1.matrix.sample_labels), len(hm2.matrix.sample_labels)))

if hm1.matrix.sample_boundaries != hm2.matrix.sample_boundaries:
sys.exit("Error: Matrices have different sample boundaries.\n")

# Check that group boundaries match
if hm1.matrix.group_boundaries != hm2.matrix.group_boundaries:
sys.exit("Error: Matrices have different group boundaries.\n")

# Determine pseudocount
pseudocount = 0.0 if args.no_pseudocount else args.pseudocount

# Perform the arithmetic operation
data1 = hm1.matrix.matrix
data2 = hm2.matrix.matrix

# Handle NaN values by preserving them in the output
nan_mask = np.isnan(data1) | np.isnan(data2)

if args.operation == 'ratio':
if pseudocount == 0:
# Avoid division by zero
with np.errstate(divide='ignore', invalid='ignore'):
result = data1 / data2
else:
result = (data1 + pseudocount) / (data2 + pseudocount)

elif args.operation == 'log2ratio':
if pseudocount == 0:
# Avoid division by zero and log of zero/negative
with np.errstate(divide='ignore', invalid='ignore'):
result = np.log2(data1 / data2)
else:
with np.errstate(invalid='ignore'):
result = np.log2((data1 + pseudocount) / (data2 + pseudocount))

elif args.operation == 'subtract':
result = data1 - data2

elif args.operation == 'add':
result = data1 + data2

elif args.operation == 'divide':
with np.errstate(divide='ignore', invalid='ignore'):
result = data1 / data2

elif args.operation == 'multiply':
result = data1 * data2

else:
sys.exit("Error: Unknown operation '{}'\n".format(args.operation))

# Preserve NaN values from original matrices
result[nan_mask] = np.nan

# Replace the matrix in hm1 with the result
hm1.matrix.matrix = result

# Update the sample labels to indicate the operation
operation_label = args.operation
if args.operation in ['ratio', 'log2ratio'] and pseudocount != 0:
operation_label += "_pc{}".format(pseudocount)

# Keep the original sample labels but could optionally modify them
# to indicate the operation performed

# Save the result
hm1.save_matrix(args.outFileName)

# Print summary
print("Arithmetic operation completed:")
print(" Operation: {}".format(args.operation))
if args.operation in ['ratio', 'log2ratio']:
print(" Pseudocount: {}".format(pseudocount))
print(" Matrix 1: {}".format(args.matrixFile[0]))
print(" Matrix 2: {}".format(args.matrixFile[1]))
print(" Output: {}".format(args.outFileName))
print(" Matrix dimensions: {}".format(result.shape))
print(" Number of NaN values: {}".format(np.sum(np.isnan(result))))


def meanMatrices(args):
"""
Calculate the element-wise mean across two or more matrices.
All matrices must have identical dimensions.
"""
if len(args.matrixFile) < 2:
sys.exit("Error: At least two matrix files are required for mean calculation.\n")

# Load all matrices
matrices = []
for matrix_file in args.matrixFile:
hm = heatmapper.heatmapper()
hm.read_matrix_file(matrix_file)
matrices.append(hm)

# Use the first matrix as reference
ref_matrix = matrices[0]
ref_shape = ref_matrix.matrix.matrix.shape

# Check that all matrices have the same dimensions
for idx, hm in enumerate(matrices[1:], start=1):
if hm.matrix.matrix.shape != ref_shape:
sys.exit("Error: Matrix {} has different dimensions. "
"Expected: {}, Got: {}\n".format(
idx + 1, ref_shape, hm.matrix.matrix.shape))

# Check that sample numbers and boundaries match
if len(hm.matrix.sample_labels) != len(ref_matrix.matrix.sample_labels):
sys.exit("Error: Matrix {} has different number of samples. "
"Expected: {}, Got: {}\n".format(
idx + 1, len(ref_matrix.matrix.sample_labels),
len(hm.matrix.sample_labels)))

if hm.matrix.sample_boundaries != ref_matrix.matrix.sample_boundaries:
sys.exit("Error: Matrix {} has different sample boundaries.\n".format(idx + 1))

# Check that group boundaries match
if hm.matrix.group_boundaries != ref_matrix.matrix.group_boundaries:
sys.exit("Error: Matrix {} has different group boundaries.\n".format(idx + 1))

# Stack all matrices and compute the mean using nanmean to handle NaN values
matrix_stack = np.array([hm.matrix.matrix for hm in matrices])

# Use nanmean to ignore NaN values when computing the mean
# If all values at a position are NaN, the result will be NaN
with np.errstate(invalid='ignore'):
result = np.nanmean(matrix_stack, axis=0)

# Replace the matrix in the reference with the mean
ref_matrix.matrix.matrix = result

# Save the result
ref_matrix.save_matrix(args.outFileName)

# Print summary
print("Mean calculation completed:")
print(" Number of matrices: {}".format(len(matrices)))
print(" Input files:")
for i, fname in enumerate(args.matrixFile, start=1):
print(" {}: {}".format(i, fname))
print(" Output: {}".format(args.outFileName))
print(" Matrix dimensions: {}".format(result.shape))
print(" Number of NaN values: {}".format(np.sum(np.isnan(result))))


def sortMatrix(hm, regionsFileName, transcriptID, transcript_id_designator, verbose=True):
"""
Iterate through the files noted by regionsFileName and sort hm accordingly
Expand Down Expand Up @@ -848,5 +1091,9 @@ def main(args=None):
elif args.command == 'relabel':
relabelMatrix(hm, args)
hm.save_matrix(args.outFileName)
elif args.command == 'arithmetic':
arithmeticMatrices(args)
elif args.command == 'mean':
meanMatrices(args)
else:
sys.exit("Unknown command {0}!\n".format(args.command))