Merge branch 'dh2-i18n-rc1' of https://github.com/cidgoh/DataHarmonizer…

… into dh2-i18n-rc1
cidgoh · Nov 12, 2024 · 9013cb9 · 9013cb9
2 parents 1628809 + d1803cb
commit 9013cb9
Show file tree

Hide file tree

Showing 72 changed files with 194,454 additions and 43,688 deletions.
diff --git a/lib/DataHarmonizer.js b/lib/DataHarmonizer.js
@@ -1586,7 +1586,7 @@ class DataHarmonizer {
         } else {
           col.type = 'key-value-list';
           if (
-            !field.sources.includes('null value menu') ||
+            !field.sources.includes('NullValueMenu') ||
             field.sources.length > 1
           ) {
             col.trimDropdown = false; // Allow expansion of pulldown past field width
@@ -1919,7 +1919,7 @@ class DataHarmonizer {
       let sources = [];
       for (const [, item] of Object.entries(field.sources)) {
         // List null value menu items directly
-        if (item === 'null value menu') {
+        if (item === 'NullValueMenu') {
           let null_values = Object.keys(
             this.schema.enums[item].permissible_values
           );
@@ -2080,11 +2080,24 @@ class DataHarmonizer {
    * @return {Dictionary<Integer>} Dictionary of all fields.
    */
   getFieldNameMap(fields) {
-    const titleMap = {};
+    const map = {};
     for (const [fieldIndex, field] of fields.entries()) {
-      titleMap[field.name] = fieldIndex;
+      map[field.name] = fieldIndex;
     }
-    return titleMap;
+    return map;
+  }
+
+  /**
+   * Build a lookup from each source field's TITLE to its column index.
+   * @param {Object} fields A flat version of data.js.
+   * @return {Dictionary<Integer>} Field titles mapped to their index.
+   */
+  getFieldTitleMap(fields) {
+    const indexByTitle = {};
+    for (const [position, { title }] of fields.entries()) {
+      indexByTitle[title] = position;
+    }
+    return indexByTitle;
   }
 
   /**
@@ -2230,24 +2243,114 @@ class DataHarmonizer {
 
       // if sources exist, fetch transformed Value
       if (field.sources) {
-        mappedCell.push(
-          self.getTransformedField(headerName, mappedCellVal, field, prefix)
-        );
-      } else if (field.multivalued === true) {
-        // ISSUE: relying on semicolon delimiter in input
-
-        for (let cellVal of mappedCellVal.split(';')) {
-          mappedCell.push(
-            self.getTransformedField(headerName, cellVal.trim(), field, prefix)
-          );
+        if (field.multivalued === true) { //Map list of semicolon-delimited choices
+          // ISSUE: relying on semicolon delimiter in input
+          for (let cellVal of mappedCellVal.split(';')) {
+            mappedCell.push( self.getTransformedField(headerName, cellVal.trim(), field, prefix));
+          }
         }
-      } else {
+        else { // Map single choice
+          mappedCell.push(self.getTransformedField(headerName, mappedCellVal, field, prefix));
+        }
+      }
+      else { // No mapping.
         mappedCell.push(mappedCellVal);
       }
     }
+
     return mappedCell.join(delimiter);
   }
 
+  /**
+   * Given a table row, output a value based on the following conditional:
+   * ```
+   * if (value in headerNameToCheck == valToMatch) {
+   *   return value in headerNameToOutput;
+   * } else {
+   *   return "";
+   * }
+   * ```
+   * TODO is there any need for additional complexities in getMappedField?
+   *   i.e., transforming field
+   * @param {string} headerNameToCheck Field name of user-inputted vals to check
+   * against `valToMatch` (compared with strict equality, no trimming).
+   * @param {string} valToMatch Value to match user-inputted vals against during
+   * conditional.
+   * @param {string} headerNameToOutput Field name of user-inputted vals to
+   * return if conditional is satisfied.
+   * @param {string[]} inputRow Table row.
+   * @param {Object<string, number>} sourceFieldNameMap `getFieldNameMap` return
+   * val.
+   * @return {string} Value in `headerNameToOutput` if condition is satisfied
+   * (empty str when that cell is null/undefined); empty str otherwise.
+   */
+  getIfThenField(
+    headerNameToCheck,
+    valToMatch,
+    headerNameToOutput,
+    inputRow,
+    sourceFieldNameMap
+  ) {
+    const valToCheck = inputRow[sourceFieldNameMap[headerNameToCheck]];
+    const valToOutput = inputRow[sourceFieldNameMap[headerNameToOutput]] ?? '';
+    return valToCheck === valToMatch ? valToOutput : '';
+  }
+
+  /**
+   * Given a table row, find the intersection of user-inputted values in
+   * `headerNameToCheck` (a semicolon-delimited list) and `matchedValsSet`.
+   * @param {string} headerNameToCheck Field name of user-inputted vals to
+   * intersect against `matchedValsSet`.
+   * @param {Set<string>} matchedValsSet Set of values that user-inputted vals
+   * are intersected against.
+   * @param {string[]} inputRow Table row.
+   * @param {Object<string, number>} sourceFieldNameMap `getFieldNameMap` return
+   * val.
+   * @return {string} Matching vals in their input order, joined with '; ';
+   * empty str if the cell is empty or nothing matches.
+   */
+  getMatchedValsField(
+    headerNameToCheck,
+    matchedValsSet,
+    inputRow,
+    sourceFieldNameMap
+  ) {
+    const valsToCheckStr = inputRow[sourceFieldNameMap[headerNameToCheck]];
+    if (!valsToCheckStr) return '';
+
+    // Split on bare ';' and trim each val, as getMappedField does, so that
+    // "a;b" and stray whitespace or line endings still match.
+    const valsToCheck = valsToCheckStr.split(';').map((val) => val.trim());
+    const matchedVals = valsToCheck.filter((val) => matchedValsSet.has(val));
+    return matchedVals.join('; ');
+  }
+
+  /**
+   * Given a table row, and an ordered collection of field names, return the
+   * first val that is not '', null, undefined or a `NullValueMenu` term.
+   * @param {string[]} headerNamesToCheck Field names of user-inputted vals to
+   * check for non-null vals, in 0-indexed order.
+   * @param {string[]} inputRow Table row.
+   * @param {Object<string, number>} sourceFieldNameMap `getFieldNameMap` return
+   * val.
+   * @return {string} First non-null val in `headerNamesToCheck`, or '' if none.
+   */
+  getFirstNonNullField(headerNamesToCheck, inputRow, sourceFieldNameMap) {
+    const nullValsSet = new Set([
+      ...Object.keys(this.schema.enums.NullValueMenu.permissible_values),
+      '',
+    ]);
+    for (const headerName of headerNamesToCheck) {
+      const rawVal = inputRow[sourceFieldNameMap[headerName]];
+      // TODO trim because copy pasting from excel == '\r\n'; wider issue?
+      const val = typeof rawVal === 'string' ? rawVal.trim() : rawVal;
+      // Treat undefined (unmapped header or short row) like null; otherwise
+      // it would be picked as the first hit and mask later populated fields.
+      if (val != null && !nullValsSet.has(val)) return val;
+    }
+    return '';
+  }
+
   /**
    * Some enumeration values get mapped over to export format values.
    *
@@ -2285,51 +2388,6 @@ class DataHarmonizer {
     return value;
   }
 
-  /**
-   * Get a dictionary of empty arrays for each ExportHeader field
-   * FUTURE: enable it to work with hierarchic vocabulary lists
-   *
-   * @param {Array<String>} sourceRow array of values to be exported.
-   * @param {Array<String>} sourceFields list of source fields to examine for mappings.
-   * @param {Array<Array>} RuleDB list of export fields modified by rules.
-   * @param {Array<Array>} fields list of export fields modified by rules.
-   * @param {Array<Integer>} titleMap map of field names to column index.
-   * @param {String} prefix of export format to examine.
-   * @return {Array<Object>} fields Dictionary of all fields.
-   */
-
-  getRowMap(sourceRow, sourceFields, RuleDB, fields, titleMap, prefix) {
-    for (const title of sourceFields) {
-      const sourceIndex = titleMap[title];
-      let value = sourceRow[sourceIndex]; // All text values.
-      // Sets source field to data value so that rules can reference it easily.
-      RuleDB[title] = value;
-      // Check to see if value is in vocabulary of given select field, and if it
-      // has a mapping for export to a GRDI target field above, then set target
-      // to value.
-      if (value && value.length > 0) {
-        const vocab_list = fields[sourceIndex]['schema:ItemList'];
-        if (value in vocab_list) {
-          const term = vocab_list[value];
-          // Looking for term.exportField['GRDI'] for example:
-          if ('exportField' in term && prefix in term.exportField) {
-            for (let mapping of term.exportField[prefix]) {
-              // Here mapping involves a value substitution
-              if ('value' in mapping) {
-                value = mapping.value;
-                // Changed on a copy of data, not handsongrid table
-                sourceRow[sourceIndex] = value;
-              }
-              if ('field' in mapping && mapping['field'] in RuleDB) {
-                RuleDB[mapping['field']] = value;
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
   /**
    * Return first and last items of a delimited string
    * @param {String} value A string of values separated by delimiter.

diff --git a/lib/Validator.js b/lib/Validator.js
@@ -45,14 +45,21 @@ class Validator {
       );
     }
 
-    // LinkML does not yet have support for non-numeric minimum_value and maximum_value. In the
-    // meantime, DataHarmonizer has a convention of putting these values in todos with specific
-    // prefixes.
+    /* LinkML does not yet have support for non-numeric minimum_value and
+    maximum_value. In the meantime, DataHarmonizer has a convention of
+    putting these values in todos with specific prefixes. If a slot has
+    any todos, process them here.

+    This has to be defined at runtime since variables like {today} exist
+    in the todos array.
+    */
     const processTodos = (slotDefinition, todos) => {
       if (!todos || !todos.length) {
         return;
       }
-      const slotType = this.#schema.types?.[slotDefinition.range];
+
+      const slotType = this.getSlotType(slotDefinition);
+
       if (slotType?.uri === 'xsd:date') {
         for (const todo of todos) {
           if (todo.substring(0, 2) === '>=') {
@@ -62,6 +69,7 @@ class Validator {
           }
         }
       }
+
       for (const def of slotDefinition.any_of || []) {
         processTodos(def, todos);
       }
@@ -75,13 +83,15 @@ class Validator {
         processTodos(def, todos);
       }
     };
+
     for (const slotDefinition of Object.values(this.#targetClassInducedSlots)) {
       processTodos(slotDefinition, slotDefinition.todos);
     }
 
-    // DataHarmonizer has a convention for using todos to specify that for a given row the value
-    // of one column is the min/max value of another column (e.g. ">={other slot name}"). Index
-    // info about that here.
+    /* DataHarmonizer has a convention for using todos to specify that for a 
+    given row the value of one column is the min/max value of another column
+    (e.g. ">={other slot name}"). Index info about that here.
+    */
     this.#dependantMinimumValuesMap = new Map();
     this.#dependantMaximumValuesMap = new Map();
     for (const slotDefinition of Object.values(this.#targetClassInducedSlots)) {
@@ -130,6 +140,32 @@ class Validator {
     this.#valueValidatorMap = new Map();
   }
 
+
+  /* Returns a single primitive data type for a slot - a decimal, date,
+   string etc. - as a schema.types[] entry. Enumerations are handled
+   separately (by const slotEnum = ...). Slots either use the "range"
+   attribute, OR an any_of / all_of / exactly_one_of / none_of expression
+   holding an array of [range: x, range: y ...]. In that case this returns
+   the schema.types[] lookup for the FIRST range in the list, which may be
+   undefined if that range is a menu (or the list is empty).
+  */
+  getSlotType(slotDefinition) {
+    const directType = this.#schema.types?.[slotDefinition.range];
+    if (directType !== undefined) {
+      return directType;
+    }
+
+    // Only the first expression attribute present is consulted, and only
+    // its first entry; `?.` keeps an empty or missing list from throwing.
+    const extendedRanges = ['any_of', 'all_of', 'exactly_one_of', 'none_of'];
+    const firstPresent = extendedRanges.find((attr) => attr in slotDefinition);
+    if (firstPresent === undefined) {
+      return undefined;
+    }
+    const firstRange = slotDefinition[firstPresent]?.[0]?.range;
+    return this.#schema.types?.[firstRange];
+  }
+
   getValidatorForSlot(slot, options = {}) {
     const { cacheKey, inheritedRange } = options;
     if (typeof cacheKey === 'string' && this.#valueValidatorMap.has(cacheKey)) {
@@ -147,12 +183,19 @@ class Validator {
       slotDefinition.range = inheritedRange;
     }
 
-    const slotType = this.#schema.types?.[slotDefinition.range];
+    const slotType = this.getSlotType(slotDefinition);
+
     const slotEnum = this.#schema.enums?.[slotDefinition.range];
     const slotPermissibleValues = Object.values(
       slotEnum?.permissible_values ?? {}
     ).map((pv) => pv.text);
 
+    // Issue: if any_of lists a NullValueList enumeration in 2nd range
+    // where first range is a date, we don't have a menu control but
+    // also a valid "Missing" etc isn't validated as ok.
+    // TEST CASE:
+    //  if (slotDefinition.name == "sample_collection_date")
+    //    console.log("any_of", DEBUG INFO)
     const anyOfValidators = (slotDefinition.any_of ?? []).map((subSlot) =>
       this.getValidatorForSlot(subSlot, {
         inheritedRange: slotDefinition.range,