Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rename sample ids #4

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 119 additions & 0 deletions python_scripts/add_metadata_columns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#!/usr/bin/env python

#-----------------------------------------------------------------------------
# Copyright (c) 2016--, Evguenia Kopylova.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
#-----------------------------------------------------------------------------

"""
Add metadata columns from super metadata file to reduced metadata file.
"""

import click
from collections import OrderedDict


def add_columns(qiime_mapping_file_fp,
                cgc_mapping_file_fp,
                metadata_column,
                output_fp,
                qiime_key_index=3,
                cgc_key_index=22):
    """Copy metadata columns from CGC metadata file to QIIME metadata file.

    Rows of the two files are matched on a filename field. The selected CGC
    columns are inserted into the QIIME mapping file immediately before its
    last column, so the last column of the input stays last in the output.

    Parameters
    ----------
    qiime_mapping_file_fp: str
        Filepath to QIIME mapping file
    cgc_mapping_file_fp: str
        Filepath to CGC mapping file
    metadata_column: tuple
        Metadata columns to copy from CGC to QIIME mapping file
    output_fp: str
        Filepath to updated QIIME mapping file
    qiime_key_index: int, optional
        Zero-based index of the filename field in QIIME mapping file rows
    cgc_key_index: int, optional
        Zero-based index of the filename field in CGC mapping file rows

    Raises
    ------
    ValueError
        If the QIIME mapping file has no '#SampleID' header line, if a
        requested metadata column is missing from the CGC header, or if a
        matching CGC row is met before the CGC header line.

    Notes
    -----
    A reviewer of this code suggested parsing the tables with pandas
    (``pd.read_csv(fp, sep='\\t', dtype=object)`` followed by
    ``set_index('#SampleID')``) as a less fragile alternative to positional
    field indexes.
    """
    # Rows keyed by filename; an OrderedDict keeps the input row order in the
    # output on every Python version.
    qiime_rows = OrderedDict()
    qiime_mapping_header = []
    with open(qiime_mapping_file_fp) as qiime_f:
        for line in qiime_f:
            if not line.strip():
                continue
            # Strip only the line terminator: a bare strip() would also drop
            # trailing empty fields (e.g. an empty last column).
            fields = line.rstrip('\r\n').split('\t')
            if fields[0] == '#SampleID':
                qiime_mapping_header = fields
                continue
            qiime_rows[fields[qiime_key_index]] = fields

    if not qiime_mapping_header:
        raise ValueError("No '#SampleID' header line found in %s"
                         % qiime_mapping_file_fp)

    # Requested column name -> index of its value in a CGC data row.
    column_indexes = OrderedDict((str(name), 0) for name in metadata_column)
    cgc_header_seen = False
    # filename -> values of the requested columns (first matching row wins)
    added_values = {}
    with open(cgc_mapping_file_fp) as cgc_f:
        for line in cgc_f:
            if not line.strip():
                continue
            fields = line.rstrip('\r\n').split('\t')
            if fields[0] == "case_name":
                for name in column_indexes:
                    if name not in fields:
                        raise ValueError(
                            "Metadata column '%s' not found in header of %s"
                            % (name, cgc_mapping_file_fp))
                    # NOTE(review): the +1 offset is kept from the original
                    # code; it presumably means data rows carry one more
                    # leading field than the header -- confirm against the
                    # CGC export format.
                    column_indexes[name] = fields.index(name) + 1
                cgc_header_seen = True
                continue
            key_file = fields[cgc_key_index]
            if key_file in added_values or key_file not in qiime_rows:
                continue
            if not cgc_header_seen:
                raise ValueError(
                    "Data row found before the 'case_name' header in %s"
                    % cgc_mapping_file_fp)
            added_values[key_file] = [fields[index]
                                      for index in column_indexes.values()]

    # output updated QIIME mapping file
    num_columns_qiime = len(qiime_mapping_header)
    new_names = list(column_indexes)
    # Samples without a CGC match get one empty value per added column so
    # that every output row has the same number of fields as the header.
    missing = [''] * len(new_names)
    with open(output_fp, 'w') as output_f:
        header = (qiime_mapping_header[:-1] + new_names +
                  qiime_mapping_header[-1:])
        output_f.write('\t'.join(header) + '\n')
        for key_file, row in qiime_rows.items():
            # Pad short rows to the header width so the last column lines up.
            row = row + [''] * (num_columns_qiime - len(row))
            out_row = (row[:num_columns_qiime - 1] +
                       added_values.get(key_file, missing) +
                       row[num_columns_qiime - 1:])
            output_f.write('\t'.join(out_row) + '\n')


@click.command()
@click.option('--qiime-mapping-file-fp', required=True,
              type=click.Path(resolve_path=True, readable=True, exists=True,
                              file_okay=True),
              help='Filepath to QIIME metadata file')
@click.option('--cgc-mapping-file-fp', required=True,
              type=click.Path(resolve_path=True, readable=True, exists=True,
                              file_okay=True),
              help='Filepath to CGC metadata file')
@click.option('--metadata-column', type=str, required=True, multiple=True,
              help="Metadata column to add from CGC to QIIME mapping file")
@click.option('--output-fp', required=True,
              type=click.Path(resolve_path=True, readable=False,
                              writable=True, exists=False, file_okay=True),
              help='Filepath to updated QIIME mapping file')
def main(qiime_mapping_file_fp,
         cgc_mapping_file_fp,
         metadata_column,
         output_fp):
    """Command-line entry point: forward the parsed options to add_columns.

    The QIIME mapping file option is required because add_columns opens it
    unconditionally.
    """
    # add the selected CGC metadata columns to the QIIME mapping file
    add_columns(qiime_mapping_file_fp,
                cgc_mapping_file_fp,
                metadata_column,
                output_fp)


if __name__ == "__main__":
    main()
84 changes: 84 additions & 0 deletions python_scripts/rename_sample_ids.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env python

#-----------------------------------------------------------------------------
# Copyright (c) 2016--, Evguenia Kopylova.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
#-----------------------------------------------------------------------------

"""
Rename sample IDs in mapping file and BIOM table for combined analysis of
multiple disease types in CGC TCGA analysis.
"""

import click
from os.path import splitext, join
from biom.table import Table
from biom.util import biom_open
from biom import load_table


def rename_sample_ids(mapping_file_fp,
                      biom_fp,
                      count_start):
    """Rename sample IDs to join mapping files

    Every sample row of the mapping file gets a new ID of the form
    ``s<N>``, with N starting at ``count_start`` and increasing by one per
    row. The rewritten mapping file is saved next to the input as
    ``<name>_modified.txt`` and the BIOM table, with the same ID
    substitution applied to its sample axis, as ``<name>_modified.biom``.

    Parameters
    ----------
    mapping_file_fp: str
        Filepath to mapping file
    biom_fp: str
        Filepath to BIOM table
    count_start: int
        First new sample ID name (ascending in order)
    """
    output_mapping_file_fp = "%s_modified.txt" % splitext(mapping_file_fp)[0]
    # old sample ID -> new sample ID
    id_map = {}
    modified_id = count_start
    # Open the input first so a bad input path does not leave an empty
    # output file behind.
    with open(mapping_file_fp, 'r') as mapping_f, \
            open(output_mapping_file_fp, 'w') as output_f:
        for line in mapping_f:
            if line.startswith('#SampleID'):
                # header line is copied through unchanged
                output_f.write(line)
                continue
            if not line.strip():
                # blank lines are not samples and must not consume an ID
                continue
            # Strip only the line terminator: a bare strip() would also drop
            # trailing empty columns from the row.
            fields = line.rstrip('\r\n').split('\t')
            new_sample_id = "s%s" % modified_id
            id_map[fields[0]] = new_sample_id
            fields[0] = new_sample_id
            output_f.write('\t'.join(fields) + '\n')
            modified_id += 1

    # update IDs in BIOM table to match modified mapping file
    output_biom_fp = "%s_modified.biom" % splitext(biom_fp)[0]
    table = load_table(biom_fp)
    table.update_ids(id_map, axis='sample')
    with biom_open(output_biom_fp, 'w') as f:
        table.to_hdf5(h5grp=f, generated_by="tcga-rename-sample-ids")


@click.command()
@click.option('--mapping-file-fp', required=True,
              type=click.Path(resolve_path=True, readable=True, exists=True,
                              file_okay=True),
              help='Filepath to mapping file')
@click.option('--biom-fp', required=True,
              type=click.Path(resolve_path=True, readable=True, exists=True,
                              file_okay=True),
              help='Filepath to BIOM table')
@click.option('--count-start', required=True, type=int,
              help='First new sample ID name (ascending in order)')
def main(mapping_file_fp,
         biom_fp,
         count_start):
    """Command-line entry point: forward the options to rename_sample_ids.

    The mapping file option is required because rename_sample_ids reads it
    unconditionally.
    """
    # rename sample ids in mapping file and BIOM table
    rename_sample_ids(mapping_file_fp,
                      biom_fp,
                      count_start)


if __name__ == "__main__":
    main()