diff --git a/metafacture-io/src/main/java/org/metafacture/io/SruOpener.java b/metafacture-io/src/main/java/org/metafacture/io/SruOpener.java
new file mode 100644
index 000000000..015a3c05d
--- /dev/null
+++ b/metafacture-io/src/main/java/org/metafacture/io/SruOpener.java
@@ -0,0 +1,302 @@
+/* Copyright 2025 Pascal Christoph
+
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.metafacture.io;
+
+import org.metafacture.framework.FluxCommand;
+import org.metafacture.framework.MetafactureException;
+import org.metafacture.framework.ObjectReceiver;
+import org.metafacture.framework.annotations.Description;
+import org.metafacture.framework.annotations.In;
+import org.metafacture.framework.annotations.Out;
+import org.metafacture.framework.helpers.DefaultObjectPipe;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.xml.sax.SAXException;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.StringWriter;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+import javax.xml.XMLConstants;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+/**
+ * Opens an SRU (Search/Retrieve via URL) stream and passes a reader to the receiver. Pages through the SRU.
+ *
+ * @author Pascal Christoph (dr0i)
+ */
+@Description(
+    "Opens a SRU stream and passes a reader to the receiver. The input is the base URL of the SRU service " +
+    "to be retrieved from. Mandatory argument is: QUERY.\n" +
+    "The output is an XML document holding the user defined \"maximumRecords\" as documents. If there are " +
+    "more documents than defined by MAXIMUM_RECORDS and there are more documents wanted (defined by " +
+    "\"totalRecords\") there will be consecutive XML documents output as it pages through the SRU.")
+@In(String.class)
+@Out(java.io.Reader.class)
+@FluxCommand("open-sru")
+@SuppressWarnings("checkstyle:ClassFanOutComplexity")
+public final class SruOpener extends DefaultObjectPipe<String, ObjectReceiver<Reader>> {
+
+    private static final String OPERATION = "searchRetrieve";
+    private static final String RECORD_SCHEMA = "MARC21-xml";
+    private static final String USER_AGENT = "metafacture-core";
+    private static final String VERSION = "2.0";
+
+    private static final int CONNECTION_TIMEOUT = 11000;
+    private static final int MAXIMUM_RECORDS = 10;
+    private static final int NO_NEXT_RECORD = Integer.MIN_VALUE;
+    private static final int START_RECORD = 1;
+
+    private boolean stopRetrieving;
+
+    private int maximumRecords = MAXIMUM_RECORDS;
+    private int startRecord = START_RECORD;
+    private int totalRecords = Integer.MAX_VALUE;
+
+    private String operation = OPERATION;
+    private String query;
+    private String recordSchema = RECORD_SCHEMA;
+    private String userAgent = USER_AGENT;
+    private String version = VERSION;
+
+    /**
+     * Default constructor
+     */
+    public SruOpener() {
+    }
+
+    /**
+     * Sets the User Agent to use. Default value: {@value #USER_AGENT}.
+     *
+     * @param userAgent a user agent to be used when opening a URL
+     */
+    public void setUserAgent(final String userAgent) {
+        this.userAgent = userAgent;
+    }
+
+    /**
+     * Sets the query of the search. Setting a query is mandatory.
+     * The query is URL encoded (UTF-8) when it is set, so it must be passed unencoded.
+     *
+     * @param query the query
+     */
+    public void setQuery(final String query) {
+        this.query = URLEncoder.encode(query, StandardCharsets.UTF_8);
+    }
+
+    /**
+     * Sets the total number of records to be retrieved. Default value: indefinite (as in "all").
+     *
+     * @param totalRecords total number of records to be retrieved
+     */
+    public void setTotalRecords(final int totalRecords) {
+        this.totalRecords = totalRecords;
+    }
+
+    /**
+     * Sets the maximum of records returned in one lookup. Default value: {@value #MAXIMUM_RECORDS}.
+     * The lookup is repeated until the total number of records (see {@link #setTotalRecords(int)})
+     * is retrieved or the service announces no further records.
+     *
+     * @param maximumRecords maximum of records returned in one lookup
+     */
+    public void setMaximumRecords(final int maximumRecords) {
+        this.maximumRecords = maximumRecords;
+    }
+
+    /**
+     * Sets where to start when retrieving records. Default value: {@value #START_RECORD}.
+     *
+     * @param startRecord where to start when retrieving records
+     */
+    public void setStartRecord(final int startRecord) {
+        this.startRecord = startRecord;
+    }
+
+    /**
+     * Sets the format of the retrieved record data. Default value: {@value #RECORD_SCHEMA}.
+     *
+     * @param recordSchema the format of the data of the records
+     */
+    public void setRecordSchema(final String recordSchema) {
+        this.recordSchema = recordSchema;
+    }
+
+    /**
+     * Sets the kind of operation of the lookup. Default value: {@value #OPERATION}.
+     *
+     * @param operation the kind of operation of the lookup
+     */
+    public void setOperation(final String operation) {
+        this.operation = operation;
+    }
+
+    /**
+     * Sets the version of the lookup. Default value: {@value #VERSION}.
+     *
+     * @param version the version of the lookup
+     */
+    public void setVersion(final String version) {
+        this.version = version;
+    }
+
+    /**
+     * Pages through the SRU service at the given base URL. Every retrieved page is passed
+     * to the receiver as a reader on the serialized XML document of that page.
+     *
+     * @param baseUrl the base URL of the SRU service
+     * @throws IllegalArgumentException if no query was set
+     * @throws MetafactureException     if a page cannot be retrieved, parsed or serialized
+     */
+    @Override
+    public void process(final String baseUrl) {
+        if (query == null) {
+            throw new IllegalArgumentException("Missing mandatory parameter 'query'");
+        }
+        final String srUrl = baseUrl + "?query=" + query + "&operation=" + operation +
+            "&recordSchema=" + recordSchema + "&version=" + version;
+        final DocumentBuilderFactory factory = createDocumentBuilderFactory();
+        final Transformer transformer = createTransformer();
+
+        // Paging state is kept local, so the configured start record survives and the opener can be reused.
+        stopRetrieving = false;
+        int nextStartRecord = startRecord;
+        int recordsRetrieved = 0;
+        int numberOfRecords = Integer.MAX_VALUE;
+        // "<=" because SRU record positions are 1-based: the record at position numberOfRecords still exists.
+        while (!stopRetrieving && recordsRetrieved < totalRecords && nextStartRecord <= numberOfRecords) {
+            // Never ask for more records than are still wanted.
+            final int pageSize = Math.min(maximumRecords, totalRecords - recordsRetrieved);
+            final Document xmlDoc = retrievePage(factory,
+                    srUrl + "&maximumRecords=" + pageSize + "&startRecord=" + nextStartRecord);
+
+            numberOfRecords = getIntegerValueFromElement(xmlDoc, "numberOfRecords", 0);
+            final int recordPosition = getIntegerValueFromElement(xmlDoc, "recordPosition", nextStartRecord);
+            final int nextRecordPosition = getIntegerValueFromElement(xmlDoc, "nextRecordPosition", NO_NEXT_RECORD);
+            emit(transformer, xmlDoc);
+
+            if (nextRecordPosition <= nextStartRecord) {
+                // No next position announced (or one that does not advance): this was the last page.
+                stopRetrieving = true;
+            }
+            else {
+                recordsRetrieved += nextRecordPosition - recordPosition;
+                nextStartRecord = nextRecordPosition;
+            }
+        }
+    }
+
+    // Creates the parser factory. The documents come from a remote service, so external entities stay unresolved.
+    private static DocumentBuilderFactory createDocumentBuilderFactory() {
+        final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+        try {
+            factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
+            factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
+            factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
+        }
+        catch (final ParserConfigurationException e) {
+            throw new MetafactureException(e);
+        }
+        return factory;
+    }
+
+    // Creates the identity transformer used to serialize the parsed pages.
+    private static Transformer createTransformer() {
+        try {
+            return TransformerFactory.newInstance().newTransformer();
+        }
+        catch (final TransformerConfigurationException e) {
+            throw new MetafactureException(e);
+        }
+    }
+
+    // Retrieves and parses one page. The HTTP response stream is closed as soon as the page is parsed.
+    private Document retrievePage(final DocumentBuilderFactory factory, final String pageUrl) {
+        try (InputStream response = retrieveUrl(pageUrl)) {
+            final DocumentBuilder docBuilder = factory.newDocumentBuilder();
+            return docBuilder.parse(response);
+        }
+        catch (final IOException | SAXException | ParserConfigurationException e) {
+            throw new MetafactureException(e);
+        }
+    }
+
+    // Serializes the page and passes it to the receiver.
+    private void emit(final Transformer transformer, final Document xmlDoc) {
+        final StringWriter stringWriter = new StringWriter();
+        try {
+            transformer.transform(new DOMSource(xmlDoc), new StreamResult(stringWriter));
+        }
+        catch (final TransformerException e) {
+            throw new MetafactureException(e);
+        }
+        // Explicit UTF-8 on both sides: the platform default charset must not be able to corrupt the records.
+        final byte[] bytes = stringWriter.toString().getBytes(StandardCharsets.UTF_8);
+        getReceiver().process(new InputStreamReader(new ByteArrayInputStream(bytes), StandardCharsets.UTF_8));
+    }
+
+    // Returns the integer content of the first element with the given tag name, or the fallback if absent or empty.
+    private int getIntegerValueFromElement(final Document xmlDoc, final String tagName, final int fallback) {
+        final Node node = xmlDoc.getElementsByTagName(tagName).item(0);
+        if (node == null) {
+            return fallback;
+        }
+        final String text = node.getTextContent().trim();
+        return text.isEmpty() ? fallback : Integer.parseInt(text);
+    }
+
+    // Opens the given URL and returns the stream of the response body.
+    private InputStream retrieveUrl(final String url) throws IOException {
+        final HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
+        connection.setConnectTimeout(CONNECTION_TIMEOUT);
+        if (userAgent != null && !userAgent.isEmpty()) {
+            connection.setRequestProperty("User-Agent", userAgent);
+        }
+        return getInputStream(connection);
+    }
+
+    // Returns the response body. On failure paging is stopped and the error body, if any, is returned instead.
+    private InputStream getInputStream(final HttpURLConnection connection) throws IOException {
+        try {
+            return connection.getInputStream();
+        }
+        catch (final IOException e) {
+            stopRetrieving = true;
+            final InputStream errorStream = connection.getErrorStream();
+            if (errorStream == null) {
+                // There is no error body to hand downstream: report the original failure.
+                throw e;
+            }
+            return errorStream;
+        }
+    }
+
+}
diff --git a/metafacture-io/src/main/resources/flux-commands.properties b/metafacture-io/src/main/resources/flux-commands.properties
index 39540d47e..e69c03c40 100644
--- a/metafacture-io/src/main/resources/flux-commands.properties
+++ b/metafacture-io/src/main/resources/flux-commands.properties
@@ -22,3 +22,4 @@ write org.metafacture.io.ObjectWriter
 as-records org.metafacture.io.RecordReader
 open-resource org.metafacture.io.ResourceOpener
 open-tar org.metafacture.io.TarReader
+open-sru org.metafacture.io.SruOpener
diff --git a/metafacture-io/src/test/java/org/metafacture/io/SruOpenerTest.java b/metafacture-io/src/test/java/org/metafacture/io/SruOpenerTest.java
new file mode 100644
index 000000000..445734b54
--- /dev/null
+++ b/metafacture-io/src/test/java/org/metafacture/io/SruOpenerTest.java
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2025 hbz
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.metafacture.io;
+
+import com.github.tomakehurst.wiremock.client.WireMock;
+import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
+import com.github.tomakehurst.wiremock.junit.WireMockRule;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.metafacture.framework.ObjectReceiver;
+import org.metafacture.framework.helpers.DefaultObjectPipe;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.UncheckedIOException;
+
+/**
+ * Tests for class {@link SruOpener}. A local WireMock server answers every lookup, so the
+ * tests need neither network access nor a live SRU endpoint.
+ */
+public final class SruOpenerTest {
+
+    private static final String RESPONSE_BODY = "response bödy"; // UTF-8
+    private static final String TEST_URL = "/test/path";
+    private static final String QUERY = "WVN=24A05";
+
+    @Rule
+    public WireMockRule wireMockRule = new WireMockRule(WireMockConfiguration.wireMockConfig()
+            .jettyAcceptors(Runtime.getRuntime().availableProcessors())
+            .dynamicPort());
+
+    private final StringBuilder resultCollector = new StringBuilder();
+    private final SruOpener sruOpener = new SruOpener();
+
+    public SruOpenerTest() {
+    }
+
+    /** Configures the opener so that everything it emits is appended to the result collector. */
+    @Before
+    public void setUp() {
+        sruOpener.setQuery(QUERY);
+        sruOpener.setReceiver(new DefaultObjectPipe<Reader, ObjectReceiver<String>>() {
+            @Override
+            public void process(final Reader reader) {
+                final char[] buffer = new char[1024];
+                try {
+                    int size = reader.read(buffer);
+                    while (size != -1) {
+                        resultCollector.append(buffer, 0, size);
+                        size = reader.read(buffer);
+                    }
+                }
+                catch (final IOException e) {
+                    throw new UncheckedIOException(e);
+                }
+            }
+        });
+    }
+
+    @Test
+    public void shouldEmitResponseOfSingleLookup() {
+        stubPage(1, 1);
+
+        sruOpener.process(serviceUrl());
+
+        Assert.assertTrue(resultCollector.toString().contains(RESPONSE_BODY));
+        wireMockRule.verify(1, WireMock.getRequestedFor(WireMock.urlPathEqualTo(TEST_URL))
+                .withQueryParam("query", WireMock.equalTo(QUERY)));
+    }
+
+    @Test
+    public void shouldPageUntilTotalRecordsAreRetrieved() {
+        stubPage(1, 3);
+        stubPage(2, 3);
+        sruOpener.setMaximumRecords(1);
+        sruOpener.setTotalRecords(2);
+
+        sruOpener.process(serviceUrl());
+
+        final String result = resultCollector.toString();
+        Assert.assertTrue(result.contains("<recordPosition>1</recordPosition>"));
+        Assert.assertTrue(result.contains("<recordPosition>2</recordPosition>"));
+        wireMockRule.verify(2, WireMock.getRequestedFor(WireMock.urlPathEqualTo(TEST_URL)));
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void shouldRejectMissingQuery() {
+        new SruOpener().process(serviceUrl());
+    }
+
+    private String serviceUrl() {
+        return "http://localhost:" + wireMockRule.port() + TEST_URL;
+    }
+
+    /** Stubs the one-record page at {@code position}; a next position is announced only while more records exist. */
+    private void stubPage(final int position, final int numberOfRecords) {
+        final String next = position < numberOfRecords ? "<nextRecordPosition>" + (position + 1) + "</nextRecordPosition>" : "";
+        final String body = "<searchRetrieveResponse><numberOfRecords>" + numberOfRecords + "</numberOfRecords>" +
+            "<records><record><recordData>" + RESPONSE_BODY + "</recordData><recordPosition>" + position +
+            "</recordPosition></record></records>" + next + "</searchRetrieveResponse>";
+        wireMockRule.stubFor(WireMock.get(WireMock.urlPathEqualTo(TEST_URL))
+                .withQueryParam("startRecord", WireMock.equalTo(String.valueOf(position))).willReturn(WireMock.aResponse().withBody(body)));
+    }
+}