Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate EOSC services mining with IIS primary workflow #1398

Closed
Closed
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,19 @@ public final class WorkflowRuntimeParameters {

private WorkflowRuntimeParameters() {}

/**
 * Normalizes a raw parameter value: a blank value or the
 * {@link WorkflowRuntimeParameters#UNDEFINED_NONEMPTY_VALUE} marker yields null,
 * any other value is handed back unchanged.
 */
public static String getValueOrNullIfNotValid(String value) {
    if (StringUtils.isBlank(value) || UNDEFINED_NONEMPTY_VALUE.equals(value)) {
        return null;
    }
    return value;
}

/**
 * Retrieves parameter from hadoop context configuration.
 *
 * @param paramName name of the parameter to look up
 * @param configuration hadoop configuration the parameter is read from
 * @return parameter value, or null when the value is blank or equal to
 * {@link WorkflowRuntimeParameters#UNDEFINED_NONEMPTY_VALUE}
 */
public static String getParamValue(String paramName, Configuration configuration) {
    // validation rules live in getValueOrNullIfNotValid so that all lookups stay consistent
    return getValueOrNullIfNotValid(configuration.get(paramName));
}

/**
Expand Down Expand Up @@ -79,12 +82,6 @@ public static String getParamValue(String paramName, String defaultValue, Map<St
* @param parameters map of parameters
*/
public static String getParamValueWithUndefinedCheck(String paramName, String defaultValue, Map<String, String> parameters) {
    // blank entries and the UNDEFINED_NONEMPTY_VALUE marker are normalized to null by the shared helper
    String paramValue = getValueOrNullIfNotValid(parameters.get(paramName));
    // an invalid or missing entry must fall back to the caller supplied default, not to null
    return paramValue != null ? paramValue : defaultValue;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,23 @@ public void init() {
configuration = new Configuration();
}

@Test
public void testGetValueOrNullIfNotValidWithValid() throws Exception {
    // given
    final String validValue = "some value";

    // execute
    String result = WorkflowRuntimeParameters.getValueOrNullIfNotValid(validValue);

    // assert: a valid value is passed through untouched
    assertEquals(validValue, result);
}

@Test
public void testGetValueOrNullIfNotValidWithInvalid() throws Exception {
    // given: whitespace-only, empty, null and the "undefined" marker are all considered invalid
    String[] invalidValues = {" ", "", null, WorkflowRuntimeParameters.UNDEFINED_NONEMPTY_VALUE};

    // assert
    for (String invalidValue : invalidValues) {
        assertNull(WorkflowRuntimeParameters.getValueOrNullIfNotValid(invalidValue));
    }
}

@Test
public void testGetParamValue() throws Exception {
Expand Down
18 changes: 18 additions & 0 deletions iis-schemas/src/main/avro/eu/dnetlib/iis/importer/Service.avdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
@namespace("eu.dnetlib.iis.importer.schemas")

protocol IIS {

// Service record declared in the importer schemas namespace.
record Service {

// InformationSpace service identifier (non-nullable)
string id;

// service name, optional: may be null
union {null , string} name;

// service url (non-nullable)
string url;
}


}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
@namespace("eu.dnetlib.iis.referenceextraction.datasource.schemas")
protocol IIS{

// Reference from a document to a datasource, together with the confidence level of the match.
record DocumentToDatasource {
// document identifier, foreign key: DocumentWithBasicMetadata.id ("document basic metadata" data store)
string documentId;
// identifier of the referenced datasource,
// foreign key: Datasource.id
string datasourceId;
// Find more details on `confidenceLevel` constraints in eu/dnetlib/iis/README.markdown file.
float confidenceLevel;
// text snippet surrounding the matched reference, required mostly for internal debugging and analytics;
// optional field, defaults to null
union { null , string } textsnippet = null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ public class OafConstants {
public static final String REL_TYPE_RESULT_RESULT = ModelConstants.RESULT_RESULT;
public static final String REL_TYPE_RESULT_PROJECT = ModelConstants.RESULT_PROJECT;
public static final String REL_TYPE_RESULT_ORGANIZATION = ModelConstants.RESULT_ORGANIZATION;
public static final String REL_TYPE_RESULT_DATASOURCE = "resultDatasource";

public static final String SUBREL_TYPE_RELATIONSHIP = ModelConstants.RELATIONSHIP;
public static final String SUBREL_TYPE_OUTCOME = ModelConstants.OUTCOME;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ public enum AlgorithmName {
document_referencedProjects,
document_referencedDatasets,
document_referencedDocuments,
document_referencedDatasources,
document_research_initiative,
document_pdb,
document_software_url,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package eu.dnetlib.iis.wf.export.actionmanager.module;

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;

import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.iis.referenceextraction.datasource.schemas.DocumentToDatasource;
import eu.dnetlib.iis.wf.export.actionmanager.OafConstants;

/**
 * Factory of action builder modules converting {@link DocumentToDatasource} records
 * into {@link Relation} based atomic actions.
 *
 * @author mhorst
 *
 */
public class DocumentToDatasourceActionBuilderModuleFactory extends AbstractActionBuilderFactory<DocumentToDatasource, Relation> {

    // ------------------------ CONSTRUCTORS --------------------------

    public DocumentToDatasourceActionBuilderModuleFactory() {
        super(AlgorithmName.document_referencedDatasources);
    }

    // ------------------------ LOGIC ---------------------------------

    @Override
    public ActionBuilderModule<DocumentToDatasource, Relation> instantiate(Configuration config) {
        return new DocumentToDatasourceActionBuilderModule(provideTrustLevelThreshold(config));
    }

    // ------------------------ INNER CLASS --------------------------

    /**
     * Builds a pair of mutually inverse relations for every {@link DocumentToDatasource} record.
     */
    class DocumentToDatasourceActionBuilderModule extends AbstractBuilderModule<DocumentToDatasource, Relation> {

        // ------------------------ CONSTRUCTORS --------------------------

        /**
         * @param trustLevelThreshold trust level threshold or null when all records should be exported
         */
        public DocumentToDatasourceActionBuilderModule(Float trustLevelThreshold) {
            super(trustLevelThreshold, buildInferenceProvenance());
        }

        // ------------------------ LOGIC --------------------------

        /**
         * Produces two actions: the document-to-datasource "references" relation
         * followed by its datasource-to-document "is referenced by" counterpart.
         */
        @Override
        public List<AtomicAction<Relation>> build(DocumentToDatasource object) throws TrustLevelThresholdExceededException {
            String documentId = object.getDocumentId().toString();
            String datasourceId = object.getDatasourceId().toString();
            float confidenceLevel = object.getConfidenceLevel();

            AtomicAction<Relation> forward = createAction(documentId, datasourceId, confidenceLevel,
                    OafConstants.REL_CLASS_REFERENCES);
            AtomicAction<Relation> backward = createAction(datasourceId, documentId, confidenceLevel,
                    OafConstants.REL_CLASS_IS_REFERENCED_BY);
            return Arrays.asList(forward, backward);
        }

        // ------------------------ PRIVATE --------------------------

        /**
         * Creates a single result-datasource relationship action.
         *
         * @param source identifier set as relation source
         * @param target identifier set as relation target
         * @param confidenceLevel confidence level the inference details are built from
         * @param relClass relation class to be set on the relation
         */
        private AtomicAction<Relation> createAction(String source, String target, float confidenceLevel,
                String relClass) throws TrustLevelThresholdExceededException {
            Relation payload = new Relation();
            payload.setSource(source);
            payload.setTarget(target);
            payload.setRelType(OafConstants.REL_TYPE_RESULT_DATASOURCE);
            payload.setSubRelType(OafConstants.SUBREL_TYPE_RELATIONSHIP);
            payload.setRelClass(relClass);
            payload.setDataInfo(buildInference(confidenceLevel));
            payload.setLastupdatetimestamp(System.currentTimeMillis());

            AtomicAction<Relation> result = new AtomicAction<>();
            result.setClazz(Relation.class);
            result.setPayload(payload);
            return result;
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
<name>input_document_to_dataset</name>
<value>$UNDEFINED$</value>
</property>
<property>
<name>input_document_to_datasource</name>
<value>$UNDEFINED$</value>
</property>
<property>
<name>input_document_to_research_initiatives</name>
<value>$UNDEFINED$</value>
Expand Down Expand Up @@ -54,7 +58,7 @@
<description>Number of reducers to be used when exporting output. Affects number of output files.
When set to 0 reducing phase is omitted, input ordering is preserved and number of output files depends on number of mappers.
Used only for subset of inputs where large number of small files is provided, namely:
input_document_to_project, input_document_to_dataset, input_document_to_document_classes, input_matched_doc_organizations.
input_document_to_project, input_document_to_dataset, input_document_to_datasource, input_document_to_document_classes, input_matched_doc_organizations.
For other inputs reducing phase is not executed.
</description>
</property>
Expand All @@ -79,6 +83,11 @@
<value>$UNDEFINED$</value>
<description>document_referencedDatasets action-set identifier of exported data</description>
</property>
<property>
<name>action_set_id_document_referencedDatasources</name>
<value>$UNDEFINED$</value>
<description>document_referencedDatasources action-set identifier of exported data</description>
</property>
<property>
<name>action_set_id_document_referencedDocuments</name>
<value>$UNDEFINED$</value>
Expand Down Expand Up @@ -304,6 +313,7 @@

<fork name="forking">
<path start="decision-exporter-document-to-dataset" />
<path start="decision-exporter-document-to-datasource" />
<path start="decision-exporter-document-to-project" />
<path start="decision-exporter-document-to-project-concepts" />
<path start="decision-exporter-document-to-researchinitiatives" />
Expand All @@ -330,6 +340,13 @@
</switch>
</decision>

<!-- runs the document-to-datasource exporter only when input_document_to_datasource is defined,
     otherwise transitions directly to the "joining" node -->
<decision name="decision-exporter-document-to-datasource">
<switch>
<case to="joining">${input_document_to_datasource eq "$UNDEFINED$"}</case>
<default to="exporter-document-to-datasource" />
</switch>
</decision>

<decision name="decision-exporter-document-to-project-concepts">
<switch>
<case to="skip_exporter-document-to-project-concepts">${input_document_to_project_concepts eq "$UNDEFINED$"}</case>
Expand Down Expand Up @@ -460,6 +477,39 @@
<error to="fail" />
</action>

<!-- exports DocumentToDatasource records as document_referencedDatasources actions -->
<action name="exporter-document-to-datasource">
<map-reduce>
<prepare>
<mkdir path="${nameNode}${output}/document_referencedDatasources" />
</prepare>
<configuration>
<property>
<name>export.action.builder.factory.classname</name>
<value>eu.dnetlib.iis.wf.export.actionmanager.module.DocumentToDatasourceActionBuilderModuleFactory</value>
</property>
<!-- schema key has to match the namespace the DocumentToDatasource record is declared in:
     eu.dnetlib.iis.referenceextraction.datasource.schemas (not the "dataset" one) -->
<property>
<name>avro.schema.input.key</name>
<value>${wf:actionData('generate-schema')['eu.dnetlib.iis.referenceextraction.datasource.schemas.DocumentToDatasource']}</value>
</property>
<property>
<name>mapreduce.input.fileinputformat.inputdir</name>
<value>${input_document_to_datasource}</value>
</property>
<property>
<name>mapreduce.output.fileoutputformat.outputdir</name>
<value>${output}/document_referencedDatasources/${action_set_id_document_referencedDatasources}</value>
</property>
<!-- reducing input with large number of small files -->
<property>
<name>mapreduce.job.reduces</name>
<value>${reduce_tasks}</value>
</property>
</configuration>
</map-reduce>
<ok to="joining" />
<error to="fail" />
</action>

<action name="exporter-document-to-project-concepts">
<map-reduce>
<prepare>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package eu.dnetlib.iis.wf.export.actionmanager.module;

import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.iis.referenceextraction.datasource.schemas.DocumentToDatasource;
import eu.dnetlib.iis.wf.export.actionmanager.OafConstants;
import eu.dnetlib.iis.wf.export.actionmanager.module.VerificationUtils.Expectations;
import org.junit.jupiter.api.Test;

import java.util.List;

import static eu.dnetlib.iis.wf.export.actionmanager.module.VerificationUtils.assertOafRel;
import static org.junit.jupiter.api.Assertions.*;

/**
* @author mhorst
*
*/
public class DocumentToDatasourceActionBuilderModuleFactoryTest extends AbstractActionBuilderModuleFactoryTest<DocumentToDatasource, Relation> {


// ----------------------- CONSTRUCTORS -------------------

public DocumentToDatasourceActionBuilderModuleFactoryTest() throws Exception {
super(DocumentToDatasourceActionBuilderModuleFactory.class, AlgorithmName.document_referencedDatasources);
}

// ----------------------- TESTS --------------------------


@Test
public void testBuildBelowThreshold() {
// given
DocumentToDatasource documentToDataset = buildDocumentToDataset("documentId", "datasourceId", 0.4f);
ActionBuilderModule<DocumentToDatasource, Relation> module = factory.instantiate(config);

// execute
assertThrows(TrustLevelThresholdExceededException.class, () -> module.build(documentToDataset));
}

@Test
public void testBuild() throws Exception {
// given
String docId = "documentId";
String datasourceId = "datasourceId";
float matchStrength = 0.9f;
ActionBuilderModule<DocumentToDatasource, Relation> module = factory.instantiate(config);

// execute
List<AtomicAction<Relation>> actions = module.build(buildDocumentToDataset(docId, datasourceId, matchStrength));

// assert
assertNotNull(actions);
assertEquals(2, actions.size());

AtomicAction<Relation> action = actions.get(0);
assertNotNull(action);
assertEquals(Relation.class, action.getClazz());
Expectations expectations = new Expectations(docId, datasourceId, matchStrength,
OafConstants.REL_TYPE_RESULT_DATASOURCE, OafConstants.SUBREL_TYPE_RELATIONSHIP,
OafConstants.REL_CLASS_REFERENCES);
assertOafRel(action.getPayload(), expectations);

// checking backward relation
action = actions.get(1);
assertNotNull(action);
assertEquals(Relation.class, action.getClazz());
expectations.setSource(datasourceId);
expectations.setTarget(docId);
expectations.setRelationClass(OafConstants.REL_CLASS_IS_REFERENCED_BY);
assertOafRel(action.getPayload(), expectations);
}

// ----------------------- PRIVATE --------------------------

private static DocumentToDatasource buildDocumentToDataset(String docId, String datasourceId,
float confidenceLevel) {
DocumentToDatasource.Builder builder = DocumentToDatasource.newBuilder();
builder.setDocumentId(docId);
builder.setDatasourceId(datasourceId);
builder.setConfidenceLevel(confidenceLevel);
return builder.build();
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"documentId":"id-10","datasourceId":"dsid-1","confidenceLevel":0.8164966, "textsnippet": null}
Loading