|
68 | 68 | from errno import EEXIST
|
69 | 69 | from qiita_core.exceptions import IncompetentQiitaDeveloperError
|
70 | 70 | from qiita_core.qiita_settings import qiita_config
|
| 71 | +from subprocess import check_output |
71 | 72 | import qiita_db as qdb
|
72 | 73 |
|
| 74 | + |
73 | 75 | from email.mime.multipart import MIMEMultipart
|
74 | 76 | from email.mime.text import MIMEText
|
75 | 77 |
|
76 | 78 | from datetime import timedelta
|
77 | 79 | import matplotlib.pyplot as plt
|
78 | 80 | import numpy as np
|
79 | 81 | import pandas as pd
|
| 82 | +from io import StringIO |
| 83 | +from json import loads |
| 84 | +from random import choice |
80 | 85 | from scipy.optimize import minimize
|
81 | 86 |
|
82 | 87 | # memory constant functions defined for @resource_allocation_plot
|
@@ -2673,3 +2678,216 @@ def _resource_allocation_failures(df, k, a, b, model, col_name, type_):
|
2673 | 2678 | df[f'c{type_}'] = model(x_plot, k, a, b)
|
2674 | 2679 | failures_df = df[df[type_] > df[f'c{type_}']]
|
2675 | 2680 | return failures_df
|
| 2681 | + |
| 2682 | + |
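|  | + # convert a sacct MaxRSS string (e.g. '123K', '4.5M', '2G') to a plain |
|  | + # number using decimal multipliers; values without a suffix are simply |
|  | + # cast to float, e.g. MaxRSS_helper('2.5M') -> 2500000.0 |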
| 2683 | +def MaxRSS_helper(x): |
| 2684 | + if x[-1] == 'K': |
| 2685 | + y = float(x[:-1]) * 1000 |
| 2686 | + elif x[-1] == 'M': |
| 2687 | + y = float(x[:-1]) * 1000000 |
| 2688 | + elif x[-1] == 'G': |
| 2689 | + y = float(x[:-1]) * 1000000000 |
| 2690 | + else: |
| 2691 | + y = float(x) |
| 2692 | + return y |
| 2693 | + |
| 2694 | + |
| 2695 | +def update_resource_allocation_table(test=None): |
| 2696 | + # On Thu, Apr 27, 2023 the old allocations (from barnacle) were changed |
| 2697 | + # to a better allocation, so we use job 1265533 as the before/after |
| 2698 | + # cutoff and only keep jobs that ran with the newest allocations |
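|  | + # `test`, when given, is expected to be a callable that returns |
|  | + # sacct-like '|'-delimited text for a job id; it lets the unit tests |
|  | + # avoid shelling out to sacct |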
| 2699 | + df = pd.DataFrame() |
| 2700 | + |
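|  | + # select all successful jobs that do not yet have an entry in |
|  | + # qiita.slurm_resource_allocations (LEFT JOIN ... IS NULL); the test |
|  | + # version simply drops the external_job_id cutoff |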
| 2701 | + sql_command = """ |
| 2702 | + SELECT |
| 2703 | + pj.processing_job_id AS processing_job_id |
| 2704 | + FROM |
| 2705 | + qiita.software_command sc |
| 2706 | + JOIN |
| 2707 | + qiita.processing_job pj ON pj.command_id = sc.command_id |
| 2708 | + JOIN |
| 2709 | + qiita.processing_job_status pjs |
| 2710 | + ON pj.processing_job_status_id = pjs.processing_job_status_id |
| 2711 | + LEFT JOIN |
| 2712 | + qiita.slurm_resource_allocations sra |
| 2713 | + ON pj.processing_job_id = sra.processing_job_id |
| 2714 | + WHERE |
| 2715 | + pjs.processing_job_status = 'success' |
| 2716 | + AND |
| 2717 | + pj.external_job_id ~ '^[0-9]+$' |
| 2718 | + AND |
| 2719 | + pj.external_job_id::INT >= 1265533 |
| 2720 | + AND |
| 2721 | + sra.processing_job_id IS NULL; |
| 2722 | + """ if test is None else """ |
| 2723 | + SELECT |
| 2724 | + pj.processing_job_id AS processing_job_id |
| 2725 | + FROM |
| 2726 | + qiita.software_command sc |
| 2727 | + JOIN |
| 2728 | + qiita.processing_job pj ON pj.command_id = sc.command_id |
| 2729 | + JOIN |
| 2730 | + qiita.processing_job_status pjs |
| 2731 | + ON pj.processing_job_status_id = pjs.processing_job_status_id |
| 2732 | + LEFT JOIN |
| 2733 | + qiita.slurm_resource_allocations sra |
| 2734 | + ON pj.processing_job_id = sra.processing_job_id |
| 2735 | + WHERE |
| 2736 | + pjs.processing_job_status = 'success' |
| 2737 | + AND |
| 2738 | + sra.processing_job_id IS NULL; |
| 2739 | + """ |
| 2740 | + |
| 2741 | + with qdb.sql_connection.TRN: |
| 2742 | + sql = sql_command |
| 2743 | + qdb.sql_connection.TRN.add(sql) |
| 2744 | + res = qdb.sql_connection.TRN.execute_fetchindex() |
| 2745 | + columns = ["processing_job_id"] |
| 2746 | + df = pd.DataFrame(res, columns=columns) |
| 2747 | + |
| 2748 | + data = [] |
| 2749 | + sacct = ['sacct', '-p', '--format=JobID,ElapsedRaw,MaxRSS,Submit,Start,' |
| 2750 | + 'CPUTimeRAW,ReqMem,AllocCPUs,AveVMSize', '-j'] |
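|  | + # -p makes the sacct output parsable ('|' delimited), which is how it |
|  | + # is read below via pd.read_csv(..., sep='|') |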
| 2751 | + |
| 2752 | + for index, row in df.iterrows(): |
| 2753 | + job = qdb.processing_job.ProcessingJob(row['processing_job_id']) |
| 2754 | + extra_info = '' |
| 2755 | + eid = job.external_id |
| 2756 | + if test is not None: |
| 2757 | + eid = choice([1005932, 1001100]) |
| 2758 | + rvals = test(eid) |
| 2759 | + else: |
| 2760 | + rvals = check_output(sacct + [eid]).decode('ascii') |
| 2765 | + |
| 2766 | + _d = pd.read_csv(StringIO(rvals), sep='|') |
| 2767 | + |
| 2768 | + _d['processing_job_id'] = job.id |
| 2769 | + _d['external_id'] = eid |
| 2770 | + |
| 2771 | + cmd = job.command |
| 2772 | + s = job.command.software |
| 2773 | + try: |
| 2774 | + samples, columns, input_size = job.shape |
| 2775 | + except qdb.exceptions.QiitaDBUnknownIDError: |
| 2776 | + # this will be raised if the study or the analysis has been |
| 2777 | + # deleted; in other words, the processing_job was run but the |
| 2778 | + # details about it were erased when the user deleted them - |
| 2779 | + # however, we keep the job for the record |
| 2780 | + continue |
| 2781 | + except TypeError as e: |
| 2782 | + # similar to the except above, except that for these commands the |
| 2783 | + # study_id is None |
| 2784 | + if cmd.name in {'create_sample_template', 'delete_sample_template', |
| 2785 | + 'list_remote_files'}: |
| 2786 | + continue |
| 2787 | + else: |
| 2788 | + raise e |
| 2789 | + sname = s.name |
| 2790 | + |
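|  | + # extra_info captures per-command details that affect resource usage: |
|  | + # the validated command for release_validators, the artifact types for |
|  | + # complete_job, the number of rarefaction steps for alpha rarefaction; |
|  | + # for Validate, input_size is the number of input files and the |
|  | + # artifact_type is appended to the software name |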
| 2791 | + if cmd.name == 'release_validators': |
| 2792 | + ej = qdb.processing_job.ProcessingJob(job.parameters.values['job']) |
| 2793 | + extra_info = ej.command.name |
| 2794 | + samples, columns, input_size = ej.shape |
| 2795 | + elif cmd.name == 'complete_job': |
| 2796 | + artifacts = loads(job.parameters.values['payload'])['artifacts'] |
| 2797 | + if artifacts is not None: |
| 2798 | + extra_info = ','.join({ |
| 2799 | + x['artifact_type'] for x in artifacts.values() |
| 2800 | + if 'artifact_type' in x}) |
| 2801 | + elif cmd.name == 'Validate': |
| 2802 | + input_size = sum([len(x) for x in loads( |
| 2803 | + job.parameters.values['files']).values()]) |
| 2804 | + sname = f"{sname} - {job.parameters.values['artifact_type']}" |
| 2805 | + elif cmd.name == 'Alpha rarefaction curves [alpha_rarefaction]': |
| 2806 | + extra_info = job.parameters.values[ |
| 2807 | + ('The number of rarefaction depths to include between ' |
| 2808 | + 'min_depth and max_depth. (steps)')] |
| 2809 | + |
| 2810 | + # In slurm, each JobID is represented by 3 rows in the dataframe: |
| 2811 | + # - external_id: the overall allocation for the job and its |
| 2812 | + # requested resources. When the Timelimit is hit, this |
| 2813 | + # container takes care of completing/stopping the |
| 2814 | + # external_id.batch job. |
| 2815 | + # - external_id.batch: the batch step; it reports how much |
| 2816 | + # memory was used, how many cpus were allocated, etc. |
| 2817 | + # - external_id.extern: accounts for anything that happens |
| 2818 | + # outside the job script but is still charged to the |
| 2819 | + # allocation; for example, if you ssh to the node and |
| 2820 | + # run something extra, or a prolog script runs, that |
| 2821 | + # processing is counted under external_id but separate |
| 2822 | + # from external_id.batch |
| 2823 | + # Here we merge all this info into a single row plus some extra |
| 2824 | + # columns |
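|  | + # An illustrative (made-up) example of the three rows for one job: |
|  | + # JobID           MaxRSS  Submit               Start |
|  | + # 1005932                 2023-04-27T10:00:00  2023-04-27T10:05:00 |
|  | + # 1005932.batch   2.5G    2023-04-27T10:05:00  2023-04-27T10:05:00 |
|  | + # 1005932.extern  4M      2023-04-27T10:05:00  2023-04-27T10:05:00 |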
| 2825 | + |
| 2826 | + def merge_rows(rows): |
| 2827 | + date_fmt = '%Y-%m-%dT%H:%M:%S' |
| 2828 | + wait_time = ( |
| 2829 | + datetime.strptime(rows.iloc[0]['Start'], date_fmt) |
| 2830 | + - datetime.strptime(rows.iloc[0]['Submit'], date_fmt)) |
| 2831 | + tmp = rows.iloc[1].copy() |
| 2832 | + tmp['WaitTime'] = wait_time |
| 2833 | + return tmp |
| 2834 | + |
| 2835 | + curr = _d.groupby( |
| 2836 | + 'external_id').apply(merge_rows).reset_index(drop=True) |
| 2837 | + |
| 2838 | + row_dict = { |
| 2839 | + 'processing_job_id': job.id, |
| 2840 | + 'external_id': eid, |
| 2841 | + 'sId': s.id, |
| 2842 | + 'sName': sname, |
| 2843 | + 'sVersion': s.version, |
| 2844 | + 'cId': cmd.id, |
| 2845 | + 'cName': cmd.name, |
| 2846 | + 'samples': samples, |
| 2847 | + 'columns': columns, |
| 2848 | + 'input_size': input_size, |
| 2849 | + 'extra_info': extra_info, |
| 2850 | + 'ElapsedRaw': curr['ElapsedRaw'].iloc[0], |
| 2851 | + 'MaxRSS': curr['MaxRSS'].iloc[0], |
| 2852 | + 'Submit': curr['Submit'].iloc[0], |
| 2853 | + 'Start': curr['Start'].iloc[0], |
| 2854 | + 'WaitTime': curr['WaitTime'].iloc[0], |
| 2855 | + 'CPUTimeRAW': curr['CPUTimeRAW'].iloc[0], |
| 2856 | + 'ReqMem': curr['ReqMem'].iloc[0], |
| 2857 | + 'AllocCPUS': curr['AllocCPUS'].iloc[0], |
| 2858 | + 'AveVMSize': curr['AveVMSize'].iloc[0] |
| 2859 | + } |
| 2860 | + |
| 2861 | + data.append(row_dict) |
| 2862 | + df = pd.DataFrame(data) |
| 2863 | + |
| 2864 | + # This is important as we are transforming MaxRSS to its raw value, |
| 2865 | + # so we need to confirm that there are no suffixes other than the |
| 2866 | + # ones MaxRSS_helper can handle |
| 2867 | + print('Make sure that only digits and K/M/G exist', set( |
| 2868 | + df.MaxRSS.apply(lambda x: str(x)[-1]))) |
| 2868 | + |
| 2869 | + # Generating new columns |
| 2870 | + df['MaxRSSRaw'] = df.MaxRSS.apply(lambda x: MaxRSS_helper(str(x))) |
| 2871 | + df['ElapsedRawTime'] = df.ElapsedRaw.apply( |
| 2872 | + lambda x: timedelta(seconds=float(x))) |
| 2873 | + |
| 2874 | + for index, row in df.iterrows(): |
| 2875 | + with qdb.sql_connection.TRN: |
| 2876 | + sql = """ |
| 2877 | + INSERT INTO qiita.slurm_resource_allocations ( |
| 2878 | + processing_job_id, |
| 2879 | + samples, |
| 2880 | + columns, |
| 2881 | + input_size, |
| 2882 | + extra_info, |
| 2883 | + memory_used, |
| 2884 | + walltime_used |
| 2885 | + ) |
| 2886 | + VALUES (%s, %s, %s, %s, %s, %s, %s) |
| 2887 | + """ |
| 2888 | + to_insert = [ |
| 2889 | + row['processing_job_id'], row['samples'], row['columns'], |
| 2890 | + row['input_size'], row['extra_info'], row['MaxRSSRaw'], |
| 2891 | + row['ElapsedRaw']] |
| 2892 | + qdb.sql_connection.TRN.add(sql, sql_args=to_insert) |
| 2893 | + qdb.sql_connection.TRN.execute() |