Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate EOSC services mining with IIS primary workflow #1398

Closed
Closed
18 changes: 18 additions & 0 deletions iis-schemas/src/main/avro/eu/dnetlib/iis/importer/Service.avdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
@namespace("eu.dnetlib.iis.importer.schemas")

protocol IIS {

// Service record: identifier, optional name and url.
record Service {

// InformationSpace service identifier
string id;

// service name (nullable, no default value declared)
union {null , string} name;

// service url (required)
string url;
}


}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
@namespace("eu.dnetlib.iis.referenceextraction.datasource.schemas")
protocol IIS{

// Single reference from a document to a datasource, along with its confidence level.
record DocumentToDatasource {
// document identifier, foreign key: DocumentWithBasicMetadata.id ("document basic metadata" data store)
string documentId;
// identifier of the referenced datasource,
// foreign key: Datasource.id
string datasourceId;
// Find more details on `confidenceLevel` constraints in eu/dnetlib/iis/README.markdown file.
float confidenceLevel;
// text snippet surrounding the matched reference, required mostly for internal debugging and analytics;
// optional, defaults to null
union { null , string } textsnippet = null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ public class OafConstants {
public static final String REL_TYPE_RESULT_RESULT = ModelConstants.RESULT_RESULT;
public static final String REL_TYPE_RESULT_PROJECT = ModelConstants.RESULT_PROJECT;
public static final String REL_TYPE_RESULT_ORGANIZATION = ModelConstants.RESULT_ORGANIZATION;
public static final String REL_TYPE_RESULT_DATASOURCE = "resultDatasource";

public static final String SUBREL_TYPE_RELATIONSHIP = ModelConstants.RELATIONSHIP;
public static final String SUBREL_TYPE_OUTCOME = ModelConstants.OUTCOME;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ public enum AlgorithmName {
document_referencedProjects,
document_referencedDatasets,
document_referencedDocuments,
document_referencedDatasources,
document_research_initiative,
document_pdb,
document_software_url,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package eu.dnetlib.iis.wf.export.actionmanager.module;

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;

import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.iis.referenceextraction.datasource.schemas.DocumentToDatasource;
import eu.dnetlib.iis.wf.export.actionmanager.OafConstants;

/**
* {@link DocumentToDatasource} based action builder module.
*
* @author mhorst
*
*/
public class DocumentToDatasourceActionBuilderModuleFactory extends AbstractActionBuilderFactory<DocumentToDatasource, Relation> {


// ------------------------ CONSTRUCTORS --------------------------

/**
* Default constructor, passes {@link AlgorithmName#document_referencedDatasources}
* to the parent factory constructor.
*/
public DocumentToDatasourceActionBuilderModuleFactory() {
super(AlgorithmName.document_referencedDatasources);
}

// ------------------------ LOGIC ---------------------------------

/**
 * Instantiates the {@link DocumentToDatasource} action builder module.
 *
 * @param config configuration the trust level threshold is obtained from
 * @return new module instance set up with the obtained threshold
 */
@Override
public ActionBuilderModule<DocumentToDatasource, Relation> instantiate(Configuration config) {
    // obtain the threshold first, then hand it over to the module
    Float trustLevelThreshold = provideTrustLevelThreshold(config);
    return new DocumentToDatasourceActionBuilderModule(trustLevelThreshold);
}

// ------------------------ INNER CLASS --------------------------

class DocumentToDatasourceActionBuilderModule extends AbstractBuilderModule<DocumentToDatasource, Relation> {


// ------------------------ CONSTRUCTORS --------------------------

/**
* Creates the module, passing the threshold and the provenance returned by
* {@code buildInferenceProvenance()} to the parent constructor.
*
* @param trustLevelThreshold trust level threshold or null when all records should be exported
*/
public DocumentToDatasourceActionBuilderModule(Float trustLevelThreshold) {
super(trustLevelThreshold, buildInferenceProvenance());
}

// ------------------------ LOGIC --------------------------

/**
 * Builds a pair of relation actions for a single document to datasource match:
 * the forward relation (document references datasource) followed by
 * the backward relation (datasource is referenced by document).
 *
 * @param object document to datasource match
 * @return two-element list: forward action first, backward action second
 * @throws TrustLevelThresholdExceededException propagated from {@code createAction}
 */
@Override
public List<AtomicAction<Relation>> build(DocumentToDatasource object) throws TrustLevelThresholdExceededException {
    String documentId = object.getDocumentId().toString();
    String datasourceId = object.getDatasourceId().toString();
    float confidenceLevel = object.getConfidenceLevel();

    AtomicAction<Relation> forward = createAction(documentId, datasourceId,
            confidenceLevel, OafConstants.REL_CLASS_REFERENCES);
    AtomicAction<Relation> backward = createAction(datasourceId, documentId,
            confidenceLevel, OafConstants.REL_CLASS_IS_REFERENCED_BY);

    return Arrays.asList(forward, backward);
}

// ------------------------ PRIVATE --------------------------

/**
* Creates a single relation action linking the given source to the given target.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm probably missing something, but is this code related to similarity?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, this is probably some copy/paste leftover. Fixed.

*/
private AtomicAction<Relation> createAction(String source, String target, float confidenceLevel,
        String relClass) throws TrustLevelThresholdExceededException {
    // payload: relation of the resultDatasource type pointing from source to target
    Relation payload = new Relation();
    payload.setRelType(OafConstants.REL_TYPE_RESULT_DATASOURCE);
    payload.setSubRelType(OafConstants.SUBREL_TYPE_RELATIONSHIP);
    payload.setRelClass(relClass);
    payload.setSource(source);
    payload.setTarget(target);
    // presumably the call raising TrustLevelThresholdExceededException - verify against parent class
    payload.setDataInfo(buildInference(confidenceLevel));
    payload.setLastupdatetimestamp(System.currentTimeMillis());

    // envelope: atomic action carrying the relation
    AtomicAction<Relation> result = new AtomicAction<>();
    result.setClazz(Relation.class);
    result.setPayload(payload);
    return result;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
<name>input_document_to_dataset</name>
<value>$UNDEFINED$</value>
</property>
<property>
<name>input_document_to_datasource</name>
<value>$UNDEFINED$</value>
</property>
<property>
<name>input_document_to_research_initiatives</name>
<value>$UNDEFINED$</value>
Expand Down Expand Up @@ -54,7 +58,7 @@
<description>Number of reducers to be used when exporting output. Affects number of output files.
When set to 0 reducing phase is omitted, input ordering is preserved and number of output files depends on number of mappers.
Used only for subset of inputs where large number of small files is provided, namely:
input_document_to_project, input_document_to_dataset, input_document_to_document_classes, input_matched_doc_organizations.
input_document_to_project, input_document_to_dataset, input_document_to_datasource, input_document_to_document_classes, input_matched_doc_organizations.
For other inputs reducing phase is not executed.
</description>
</property>
Expand All @@ -79,6 +83,11 @@
<value>$UNDEFINED$</value>
<description>document_referencedDatasets action-set identifier of exported data</description>
</property>
<property>
<name>action_set_id_document_referencedDatasources</name>
<value>$UNDEFINED$</value>
<description>document_referencedDatasources action-set identifier of exported data</description>
</property>
<property>
<name>action_set_id_document_referencedDocuments</name>
<value>$UNDEFINED$</value>
Expand Down Expand Up @@ -304,6 +313,7 @@

<fork name="forking">
<path start="decision-exporter-document-to-dataset" />
<path start="decision-exporter-document-to-datasource" />
<path start="decision-exporter-document-to-project" />
<path start="decision-exporter-document-to-project-concepts" />
<path start="decision-exporter-document-to-researchinitiatives" />
Expand All @@ -330,6 +340,13 @@
</switch>
</decision>

<!-- Runs the document to datasource exporter only when its input is defined;
when input_document_to_datasource equals $UNDEFINED$ the flow goes directly to the "joining" node. -->
<decision name="decision-exporter-document-to-datasource">
<switch>
<case to="joining">${input_document_to_datasource eq "$UNDEFINED$"}</case>
<default to="exporter-document-to-datasource" />
</switch>
</decision>

<decision name="decision-exporter-document-to-project-concepts">
<switch>
<case to="skip_exporter-document-to-project-concepts">${input_document_to_project_concepts eq "$UNDEFINED$"}</case>
Expand Down Expand Up @@ -460,6 +477,39 @@
<error to="fail" />
</action>

<!-- Exports DocumentToDatasource records as document_referencedDatasources actions. -->
<action name="exporter-document-to-datasource">
<map-reduce>
<prepare>
<mkdir path="${nameNode}${output}/document_referencedDatasources" />
</prepare>
<configuration>
<property>
<name>export.action.builder.factory.classname</name>
<value>eu.dnetlib.iis.wf.export.actionmanager.module.DocumentToDatasourceActionBuilderModuleFactory</value>
</property>
<!-- schema key has to match the namespace declared for the DocumentToDatasource record:
eu.dnetlib.iis.referenceextraction.datasource.schemas (not the dataset one) -->
<property>
<name>avro.schema.input.key</name>
<value>${wf:actionData('generate-schema')['eu.dnetlib.iis.referenceextraction.datasource.schemas.DocumentToDatasource']}</value>
</property>
<property>
<name>mapreduce.input.fileinputformat.inputdir</name>
<value>${input_document_to_datasource}</value>
</property>
<property>
<name>mapreduce.output.fileoutputformat.outputdir</name>
<value>${output}/document_referencedDatasources/${action_set_id_document_referencedDatasources}</value>
</property>
<!-- reducing input with large number of small files -->
<property>
<name>mapreduce.job.reduces</name>
<value>${reduce_tasks}</value>
</property>
</configuration>
</map-reduce>
<ok to="joining" />
<error to="fail" />
</action>

<action name="exporter-document-to-project-concepts">
<map-reduce>
<prepare>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package eu.dnetlib.iis.wf.export.actionmanager.module;

import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.iis.referenceextraction.datasource.schemas.DocumentToDatasource;
import eu.dnetlib.iis.wf.export.actionmanager.OafConstants;
import eu.dnetlib.iis.wf.export.actionmanager.module.VerificationUtils.Expectations;
import org.junit.jupiter.api.Test;

import java.util.List;

import static eu.dnetlib.iis.wf.export.actionmanager.module.VerificationUtils.assertOafRel;
import static org.junit.jupiter.api.Assertions.*;

/**
 * Tests of the {@link DocumentToDatasourceActionBuilderModuleFactory} and the module it instantiates.
 *
 * @author mhorst
 *
 */
public class DocumentToDatasourceActionBuilderModuleFactoryTest extends AbstractActionBuilderModuleFactoryTest<DocumentToDatasource, Relation> {

    // ----------------------- CONSTRUCTORS -------------------

    public DocumentToDatasourceActionBuilderModuleFactoryTest() throws Exception {
        super(DocumentToDatasourceActionBuilderModuleFactory.class, AlgorithmName.document_referencedDatasources);
    }

    // ----------------------- TESTS --------------------------

    @Test
    public void testBuildBelowThreshold() {
        // given
        ActionBuilderModule<DocumentToDatasource, Relation> module = factory.instantiate(config);
        DocumentToDatasource documentToDatasource = buildDocumentToDatasource("documentId", "datasourceId", 0.4f);

        // execute & assert
        assertThrows(TrustLevelThresholdExceededException.class, () -> module.build(documentToDatasource));
    }

    @Test
    public void testBuild() throws Exception {
        // given
        String docId = "documentId";
        String datasourceId = "datasourceId";
        float matchStrength = 0.9f;
        DocumentToDatasource documentToDatasource = buildDocumentToDatasource(docId, datasourceId, matchStrength);
        ActionBuilderModule<DocumentToDatasource, Relation> module = factory.instantiate(config);

        // execute
        List<AtomicAction<Relation>> actions = module.build(documentToDatasource);

        // assert
        assertNotNull(actions);
        assertEquals(2, actions.size());

        // forward relation: document references datasource
        Expectations expectations = new Expectations(docId, datasourceId, matchStrength,
                OafConstants.REL_TYPE_RESULT_DATASOURCE, OafConstants.SUBREL_TYPE_RELATIONSHIP,
                OafConstants.REL_CLASS_REFERENCES);
        assertRelationAction(actions.get(0), expectations);

        // backward relation: datasource is referenced by document
        expectations.setSource(datasourceId);
        expectations.setTarget(docId);
        expectations.setRelationClass(OafConstants.REL_CLASS_IS_REFERENCED_BY);
        assertRelationAction(actions.get(1), expectations);
    }

    // ----------------------- PRIVATE --------------------------

    /**
     * Verifies that the action is non-null, declares {@link Relation} as its class
     * and carries a payload matching the expectations.
     */
    private static void assertRelationAction(AtomicAction<Relation> action, Expectations expectations)
            throws Exception {
        assertNotNull(action);
        assertEquals(Relation.class, action.getClazz());
        assertOafRel(action.getPayload(), expectations);
    }

    /**
     * Builds a {@link DocumentToDatasource} record out of the given identifiers and confidence level.
     */
    private static DocumentToDatasource buildDocumentToDatasource(String docId, String datasourceId,
            float confidenceLevel) {
        return DocumentToDatasource.newBuilder()
                .setDocumentId(docId)
                .setDatasourceId(datasourceId)
                .setConfidenceLevel(confidenceLevel)
                .build();
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"documentId":"id-10","datasourceId":"dsid-1","confidenceLevel":0.8164966, "textsnippet": null}
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@
eu/dnetlib/iis/wf/export/actionmanager/sequencefile/sampledataproducer/input/document_to_dataset.json}
</arg>
<arg>-Odocument_to_dataset=${workingDir}/producer/document_to_dataset</arg>
<arg>-C{document_to_datasource,
eu.dnetlib.iis.referenceextraction.datasource.schemas.DocumentToDatasource,
eu/dnetlib/iis/wf/export/actionmanager/sequencefile/sampledataproducer/input/document_to_datasource.json}
</arg>
<arg>-Odocument_to_datasource=${workingDir}/producer/document_to_datasource</arg>
<arg>-C{document_to_project,
eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject,
eu/dnetlib/iis/wf/export/actionmanager/sequencefile/sampledataproducer/input/document_to_project.json}
Expand Down Expand Up @@ -108,6 +113,10 @@
<name>input_document_to_dataset</name>
<value>${workingDir}/producer/document_to_dataset</value>
</property>
<property>
<name>input_document_to_datasource</name>
<value>${workingDir}/producer/document_to_datasource</value>
</property>
<property>
<name>input_document_to_research_initiatives</name>
<value>${workingDir}/producer/document_to_research_initiatives</value>
Expand Down Expand Up @@ -161,6 +170,10 @@
<name>action_set_id_document_referencedDatasets</name>
<value>actionset-id</value>
</property>
<property>
<name>action_set_id_document_referencedDatasources</name>
<value>actionset-id</value>
</property>
<property>
<name>action_set_id_document_referencedDocuments</name>
<value>actionset-id</value>
Expand Down Expand Up @@ -211,6 +224,7 @@
<path start="consumer-document_covid19" />
<path start="consumer-document_similarity" />
<path start="consumer-document_to_dataset" />
<path start="consumer-document_to_datasource" />
<path start="consumer-document_to_project" />
<path start="consumer-document_to_research_initiatives" />
<path start="consumer-document_to_communities" />
Expand Down Expand Up @@ -285,6 +299,18 @@
<error to="fail" />
</action>

<!-- Runs TestingConsumer on the document_referencedDatasources sequence file written under the
"actionset-id" action set, with two expectation property files passed as expectation_file_paths. -->
<action name="consumer-document_to_datasource">
<java>
<main-class>eu.dnetlib.iis.common.java.ProcessWrapper</main-class>
<arg>eu.dnetlib.iis.wf.export.actionmanager.sequencefile.TestingConsumer</arg>
<arg>-Iseqfile=${workingDir}/output/document_referencedDatasources/actionset-id</arg>
<arg>-Pexpectation_file_paths=/eu/dnetlib/iis/wf/export/actionmanager/sequencefile/sampledataproducer/output/document_to_datasource_1.properties,/eu/dnetlib/iis/wf/export/actionmanager/sequencefile/sampledataproducer/output/document_to_datasource_2.properties
</arg>
</java>
<ok to="joining" />
<error to="fail" />
</action>

<action name="consumer-document_to_project">
<java>
<main-class>eu.dnetlib.iis.common.java.ProcessWrapper</main-class>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
clazz.canonicalName=eu.dnetlib.dhp.schema.oaf.Relation

payload.relType=resultDatasource
payload.subRelType=relationship
payload.relClass=References
payload.source=id-10
payload.target=dsid-1

payload.dataInfo.inferred=true
payload.dataInfo.trust=0.7348
payload.dataInfo.inferenceprovenance=iis::document_referencedDatasources
payload.dataInfo.provenanceaction.classid=iis
payload.dataInfo.provenanceaction.classname=iis
payload.dataInfo.provenanceaction.schemeid=dnet:provenanceActions
payload.dataInfo.provenanceaction.schemename=dnet:provenanceActions
Loading