Skip to content

Commit

Permalink
documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
bedroesb committed Jan 18, 2022
1 parent 5c48db8 commit 9f59f7f
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 26 deletions.
14 changes: 9 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,17 @@ All supported arguments:
--experiment EXPERIMENT
table of EXPERIMENT object
--run RUN table of RUN object
--data [FILE ...] data for submission
--data [FILE [FILE ...]]
data for submission
--center CENTER_NAME specific to your Webin account
--checklist CHECKLIST
specify the sample checklist with following pattern: ERC0000XX, Default: ERC000011
--xlsx XLSX Excel table with metadata
--xlsx XLSX filled in excel template with metadata
--auto_action BETA: detect automatically which action (add or modify) to apply when the action column is not given
--tool TOOL_NAME specify the name of the tool this submission is done with. Default: ena-upload-cli
--tool_version TOOL_VERSION
specify the version of the tool this submission is done with
--no_data_upload indicate if no upload should be performed and you like to submit a RUN object (e.g. if uploaded
was done separately).
--no_data_upload indicate if no upload should be performed and you like to submit a RUN object (e.g. if uploaded was done separately).
--draft indicate if no submission should be performed
--secret SECRET .secret.yml file containing the password and Webin ID of your ENA account
-d, --dev flag to use the dev/sandbox endpoint of ENA
Expand Down Expand Up @@ -161,7 +162,10 @@ Use the *--dev* flag if you want to do a test submission using the tool by the s

### Submitting a selection of rows to ENA

Optionally you can add a status column to every table that contains the action you want to apply during this submission. If you chose to add only the first 2 samples to ENA, you specify `--action add` as parameter in the command and you add the `add` value to the status column of the rows you want to submit as demonstrated below. Same holds for the action `modify`, `release` and `cancel`.
There are two ways of submitting only a selection of objects to ENA. This is handy for reoccurring submissions, especially when they belong to the same study.

- Manual: you can add an optional `status` column to every table/sheet that contains the action you want to apply during this submission. If you choose to add only the first 2 samples to ENA, specify `--action add` as a parameter in the command and add the `add` value to the status column of the rows you want to submit, as demonstrated below. The same holds for the actions `modify`, `release` and `cancel`.
- Automatic (BETA): using the `--auto_action` flag, the tool auto-detects whether an object (using the alias) is already present on ENA and adds `modify` or `add` to the table accordingly. This works only with ENA objects that are published and findable on the website through the search function. If the tool does not correctly detect an object's presence on the website, we suggest using the more robust manual approach described above.

**Example with modify as seen in the [example sample modify table](example_tables/ENA_template_samples_modify.tsv)**

Expand Down
51 changes: 30 additions & 21 deletions ena_upload/ena_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,12 +98,15 @@ def check_columns(df, schema, action, dev, auto_action):
if auto_action:
for index, row in df.iterrows():
try:
remote_action = str(identify_action(schema, str(df['alias'][index]), dev)).upper()
print(f"\t'{df['alias'][index]}' gets '{remote_action}' as action in the status column")
remote_action = str(identify_action(
schema, str(df['alias'][index]), dev)).upper()
print(
f"\t'{df['alias'][index]}' gets '{remote_action}' as action in the status column")

except Exception as e:
print(e)
print(f"Something went wrong with detecting the ENA object {df['alias'][index]} on the servers of ENA. This object will be skipped.")
print(
f"Something went wrong with detecting the ENA object {df['alias'][index]} on the servers of ENA. This object will be skipped.")
df.at[index, header] = remote_action
else:
# status column contain action keywords
Expand All @@ -118,6 +121,7 @@ def check_columns(df, schema, action, dev, auto_action):

return df


def check_filenames(file_paths, run_df):
"""Compare data filenames from command line and from RUN table.
Expand Down Expand Up @@ -699,10 +703,10 @@ def process_args():

parser.add_argument('--checklist', help="specify the sample checklist with following pattern: ERC0000XX, Default: ERC000011", dest='checklist',
default='ERC000011')

parser.add_argument('--xlsx',
help='Excel table with metadata')
help='filled in excel template with metadata')

parser.add_argument('--auto_action',
action="store_true",
default=False,
Expand Down Expand Up @@ -747,15 +751,15 @@ def process_args():
if not os.path.isfile(args.secret):
msg = f"Oops, the file {args.secret} does not exist"
parser.error(msg)

# check if xlsx file exists
if args.xlsx:
if not os.path.isfile(args.xlsx):
msg = f"Oops, the file {args.xlsx} does not exist"
parser.error(msg)

# check if data is given when adding a 'run' table
if (not args.no_data_upload and args.run and args.action.upper() not in ['RELEASE','CANCEL']) or (not args.no_data_upload and args.xlsx and args.action.upper() not in ['RELEASE','CANCEL']):
if (not args.no_data_upload and args.run and args.action.upper() not in ['RELEASE', 'CANCEL']) or (not args.no_data_upload and args.xlsx and args.action.upper() not in ['RELEASE', 'CANCEL']):
if args.data is None:
parser.error('Oops, requires data for submitting RUN object')

Expand Down Expand Up @@ -784,6 +788,7 @@ def collect_tables(args):

return schema_tables


def update_date(date):
if pd.isnull(date) or isinstance(date, str):
return date
Expand Down Expand Up @@ -830,16 +835,19 @@ def main():
elif f"ENA_{schema}" in xl_workbook.book.sheetnames:
xl_sheet = xl_workbook.parse(f"ENA_{schema}", header=0)
else:
sys.exit(f"The sheet '{schema}' is not present in the excel sheet {xlsx}")
sys.exit(
f"The sheet '{schema}' is not present in the excel sheet {xlsx}")
xl_sheet = xl_sheet.drop(0).dropna(how='all')
for column_name in list(xl_sheet.columns.values):
if 'date' in column_name:
xl_sheet[column_name] = xl_sheet[column_name].apply(update_date)
xl_sheet[column_name] = xl_sheet[column_name].apply(
update_date)

if True in xl_sheet.columns.duplicated():
sys.exit("Duplicated columns found")

xl_sheet = check_columns(xl_sheet, schema, action, dev, auto_action)
xl_sheet = check_columns(
xl_sheet, schema, action, dev, auto_action)
schema_dataframe[schema] = xl_sheet
path = os.path.dirname(os.path.abspath(xlsx))
schema_tables[schema] = f"{path}/ENA_template_{schema}.tsv"
Expand All @@ -848,7 +856,8 @@ def main():
schema_tables = collect_tables(args)

# create dataframe from table
schema_dataframe = create_dataframe(schema_tables, action, dev, auto_action)
schema_dataframe = create_dataframe(
schema_tables, action, dev, auto_action)

# ? add a function to sanitize characters
# ? print 'validate table for specific action'
Expand All @@ -872,11 +881,11 @@ def main():
file_paths = {}
if args.data:
for path in args.data:
file_paths[os.path.basename(path)] = os.path.abspath(path)
file_paths[os.path.basename(path)] = os.path.abspath(path)
# check if file names identical between command line and table
# if not, system exits
check_filenames(file_paths, df)

# generate MD5 sum if not supplied in table
if file_paths and not check_file_checksum(df):
print("No valid checksums found, generate now...", end=" ")
Expand Down Expand Up @@ -972,20 +981,20 @@ def main():
sys.exit(receipt)

if action in ['ADD', 'MODIFY']:
if draft:
if draft:
schema_dataframe = update_table_simple(schema_dataframe,
schema_targets,
action)
schema_targets,
action)
else:
schema_dataframe = update_table(schema_dataframe,
schema_targets,
schema_update)
schema_targets,
schema_update)
# save updates in new tables
save_update(schema_tables, schema_dataframe)
elif action in ['CANCEL', 'RELEASE']:
schema_dataframe = update_table_simple(schema_dataframe,
schema_targets,
action)
schema_targets,
action)
# save updates in new tables
save_update(schema_tables, schema_dataframe)

Expand Down

0 comments on commit 9f59f7f

Please sign in to comment.