diff --git a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java index 285e6ae32..f501b04e8 100644 --- a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java +++ b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java @@ -18,8 +18,8 @@ import static org.codelibs.core.stream.StreamUtil.stream; import java.io.BufferedInputStream; -import java.net.MalformedURLException; -import java.net.URL; +import java.net.URI; +import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -410,12 +410,12 @@ protected boolean isValidUrl(final String urlStr) { value = urlStr; } try { - final URL url = new java.net.URL(value); - final String host = url.getHost(); + final URI uri = URI.create(value); + final String host = uri.getHost(); if (StringUtil.isBlank(host) || "http".equalsIgnoreCase(host) || "https".equalsIgnoreCase(host)) { return false; } - } catch (final MalformedURLException e) { + } catch (final IllegalArgumentException e) { return false; } return true; @@ -725,9 +725,10 @@ protected String getCanonicalUrl(final ResponseData responseData, final Document */ protected String normalizeCanonicalUrl(final String baseUrl, final String canonicalUrl) { try { - final URL u = new URL(baseUrl); - return new URL(u, canonicalUrl.startsWith(":") ? u.getProtocol() + canonicalUrl : canonicalUrl).toString(); - } catch (final MalformedURLException e) { + final URI baseUri = URI.create(baseUrl); + final String resolveTarget = canonicalUrl.startsWith(":") ? baseUri.getScheme() + canonicalUrl : canonicalUrl; + return baseUri.resolve(resolveTarget).toString(); + } catch (final IllegalArgumentException e) { logger.warn("Invalid canonical URL: baseUrl={}, canonicalUrl={}", baseUrl, canonicalUrl, e); } return null; @@ -982,9 +983,9 @@ protected List getAnchorList(final Document document, final ResponseData List anchorList = new ArrayList<>(); final String baseHref = getBaseHref(document); try { - final URL url = getBaseUrl(responseData.getUrl(), baseHref); + final URI uri = getBaseUri(responseData.getUrl(), baseHref); for (final Map.Entry entry : childUrlRuleMap.entrySet()) { - for (final String u : getUrlFromTagAttribute(url, document, entry.getKey(), entry.getValue(), responseData.getCharSet())) { + for (final String u : getUrlFromTagAttribute(uri, document, entry.getKey(), entry.getValue(), responseData.getCharSet())) { anchorList.add(RequestDataBuilder.newRequestData().get().url(u).build()); } } @@ -1001,18 +1002,18 @@ protected List getAnchorList(final Document document, final ResponseData } /** - * Gets the base URL for resolving relative URLs. + * Gets the base URI for resolving relative URLs. * * @param currentUrl the current URL * @param baseHref the base href value from HTML - * @return the base URL - * @throws MalformedURLException if the URL is malformed + * @return the base URI + * @throws URISyntaxException if the URI is malformed */ - protected URL getBaseUrl(final String currentUrl, final String baseHref) throws MalformedURLException { + protected URI getBaseUri(final String currentUrl, final String baseHref) throws URISyntaxException { if (baseHref != null) { - return getURL(currentUrl, baseHref); + return getURI(currentUrl, baseHref); } - return new URL(currentUrl); + return URI.create(currentUrl); } /** @@ -1089,19 +1090,19 @@ public Object getData(final AccessResultData accessResultData) { * Adds child URL from tag attribute value. * * @param urlList the list to add URLs to - * @param url the base URL for resolving relative URLs + * @param uri the base URI for resolving relative URLs * @param attrValue the attribute value containing the URL * @param encoding the character encoding */ @Override - protected void addChildUrlFromTagAttribute(final List urlList, final URL url, final String attrValue, final String encoding) { + protected void addChildUrlFromTagAttribute(final List urlList, final URI uri, final String attrValue, final String encoding) { final String urlValue = attrValue.trim(); - URL childUrl; String u = null; try { - childUrl = new URL(url, urlValue.startsWith(":") ? url.getProtocol() + urlValue : urlValue); - u = encodeUrl(normalizeUrl(childUrl.toExternalForm()), encoding); - } catch (final MalformedURLException e) { + final String resolveTarget = urlValue.startsWith(":") ? uri.getScheme() + urlValue : urlValue; + final URI childUri = uri.resolve(resolveTarget); + u = encodeUrl(normalizeUrl(childUri.toString()), encoding); + } catch (final IllegalArgumentException e) { final int pos = urlValue.indexOf(':'); if (pos > 0 && pos < 10) { u = encodeUrl(normalizeUrl(urlValue), encoding); @@ -1109,7 +1110,7 @@ protected void addChildUrlFromTagAttribute(final List urlList, final URL } if (u == null) { - logger.warn("Ignored child URL: childUrl={}, parentUrl={}", attrValue, url); + logger.warn("Ignored child URL: childUrl={}, parentUri={}", attrValue, uri); return; } @@ -1161,9 +1162,9 @@ protected String getThumbnailUrl(final ResponseData responseData, final Document if (thumbnailNode != null) { final String content = thumbnailNode.getTextContent(); if (StringUtil.isNotBlank(content)) { - final URL thumbnailUrl = getURL(responseData.getUrl(), content); - if (thumbnailUrl != null) { - return thumbnailUrl.toExternalForm(); + final URI thumbnailUri = getURI(responseData.getUrl(), content); + if (thumbnailUri != null) { + return thumbnailUri.toString(); } } } @@ -1173,9 +1174,9 @@ protected String getThumbnailUrl(final ResponseData responseData, final Document if (ogImageNode != null) { final String content = ogImageNode.getTextContent(); if (StringUtil.isNotBlank(content)) { - final URL thumbnailUrl = getURL(responseData.getUrl(), content); - if (thumbnailUrl != null) { - return thumbnailUrl.toExternalForm(); + final URI thumbnailUri = getURI(responseData.getUrl(), content); + if (thumbnailUri != null) { + return thumbnailUri.toString(); } } } @@ -1227,9 +1228,9 @@ protected String getThumbnailSrc(final String url, final NamedNodeMap attributes final Node srcNode = attributes.getNamedItem("src"); if (srcNode != null) { try { - final URL thumbnailUrl = getURL(url, srcNode.getTextContent()); - if (thumbnailUrl != null) { - return thumbnailUrl.toExternalForm(); + final URI thumbnailUri = getURI(url, srcNode.getTextContent()); + if (thumbnailUri != null) { + return thumbnailUri.toString(); } } catch (final Exception e) { if (logger.isDebugEnabled()) { @@ -1267,27 +1268,27 @@ protected Integer getAttributeAsInteger(final NamedNodeMap attributes, final Str } /** - * Creates a URL object from the current URL and a relative or absolute URL string. + * Creates a URI object from the current URL and a relative or absolute URL string. * * @param currentUrl the current URL as base * @param url the URL string to process - * @return the URL object - * @throws MalformedURLException if the URL is malformed + * @return the URI object + * @throws URISyntaxException if the URI is malformed */ - protected URL getURL(final String currentUrl, final String url) throws MalformedURLException { + protected URI getURI(final String currentUrl, final String url) throws URISyntaxException { if (url != null) { if (url.startsWith("://")) { final String protocol = currentUrl.split(":")[0]; - return new URL(protocol + url); + return URI.create(protocol + url); } if (url.startsWith("//")) { final String protocol = currentUrl.split(":")[0]; - return new URL(protocol + ":" + url); + return URI.create(protocol + ":" + url); } if (url.startsWith("/") || url.indexOf(':') == -1) { - return new URL(new URL(currentUrl), url); + return URI.create(currentUrl).resolve(url); } - return new URL(url); + return URI.create(url); } return null; } diff --git a/src/main/java/org/codelibs/fess/helper/ProtocolHelper.java b/src/main/java/org/codelibs/fess/helper/ProtocolHelper.java index 1b81101c9..68bca14ea 100644 --- a/src/main/java/org/codelibs/fess/helper/ProtocolHelper.java +++ b/src/main/java/org/codelibs/fess/helper/ProtocolHelper.java @@ -22,7 +22,8 @@ import java.io.IOException; import java.lang.reflect.Field; import java.net.JarURLConnection; -import java.net.URL; +import java.net.URI; +import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Arrays; import java.util.Enumeration; @@ -96,24 +97,32 @@ protected void loadProtocols(final String basePackage) { final String path = basePackage.replace('.', '/'); final ClassLoader classLoader = Thread.currentThread().getContextClassLoader(); try { - final Enumeration resources = classLoader.getResources(path); + final Enumeration resources = classLoader.getResources(path); while (resources.hasMoreElements()) { - final URL resource = resources.nextElement(); + final java.net.URL resource = resources.nextElement(); logger.debug("Loading resource: url={}", resource); - if ("file".equals(resource.getProtocol())) { - final File directory = new File(resource.getFile()); + final URI resourceUri; + try { + resourceUri = resource.toURI(); + } catch (final URISyntaxException e) { + logger.warn("Invalid URI for resource: url={}", resource, e); + continue; + } + + if ("file".equals(resourceUri.getScheme())) { + final File directory = new File(resourceUri); if (directory.exists() && directory.isDirectory()) { final File[] files = directory.listFiles(File::isDirectory); if (files != null) { for (final File file : files) { final String name = file.getName(); subPackages.add(name); - logger.debug("Found subpackage: name={}, resource={}", name, resource); + logger.debug("Found subpackage: name={}, resource={}", name, resourceUri); } } } - } else if ("jar".equals(resource.getProtocol())) { + } else if ("jar".equals(resourceUri.getScheme())) { final JarURLConnection jarURLConnection = (JarURLConnection) resource.openConnection(); try (JarFile jarFile = jarURLConnection.getJarFile()) { final Enumeration entries = jarFile.entries(); @@ -124,7 +133,7 @@ protected void loadProtocols(final String basePackage) { final String name = entryName.substring(path.length() + 1, entryName.length() - 1); if (name.indexOf('/') == -1) { subPackages.add(name); - logger.debug("Found subpackage: name={}, resource={}", name, resource); + logger.debug("Found subpackage: name={}, resource={}", name, resourceUri); } } } diff --git a/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java b/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java index 51d6c7dd1..c0fc881ea 100644 --- a/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java +++ b/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java @@ -18,7 +18,7 @@ import java.io.ByteArrayInputStream; import java.io.StringWriter; import java.lang.reflect.Field; -import java.net.URL; +import java.net.URI; import java.util.ArrayList; import java.util.Collections; import java.util.Date; @@ -808,39 +808,39 @@ public void test_normalizeCanonicalUrl() throws Exception { assertEquals("http://hoge.com/aaa", value); } - public void test_getBaseUrl() throws Exception { + public void test_getBaseUri() throws Exception { final FessXpathTransformer transformer = new FessXpathTransformer(); - URL value; + URI value; - value = transformer.getBaseUrl("http://hoge.com/", null); - assertEquals("http://hoge.com/", value.toExternalForm()); + value = transformer.getBaseUri("http://hoge.com/", null); + assertEquals("http://hoge.com/", value.toString()); - value = transformer.getBaseUrl("http://hoge.com/", "http://hoge.com/"); - assertEquals("http://hoge.com/", value.toExternalForm()); + value = transformer.getBaseUri("http://hoge.com/", "http://hoge.com/"); + assertEquals("http://hoge.com/", value.toString()); - value = transformer.getBaseUrl("http://hoge.com/aaa/bbb.html", "http://hoge.com/"); - assertEquals("http://hoge.com/", value.toExternalForm()); + value = transformer.getBaseUri("http://hoge.com/aaa/bbb.html", "http://hoge.com/"); + assertEquals("http://hoge.com/", value.toString()); - value = transformer.getBaseUrl("http://hoge.com/aaa/bbb.html", "http://hoge.com/ccc/"); - assertEquals("http://hoge.com/ccc/", value.toExternalForm()); + value = transformer.getBaseUri("http://hoge.com/aaa/bbb.html", "http://hoge.com/ccc/"); + assertEquals("http://hoge.com/ccc/", value.toString()); - value = transformer.getBaseUrl("http://hoge.com/aaa/bbb.html", null); - assertEquals("http://hoge.com/aaa/bbb.html", value.toExternalForm()); + value = transformer.getBaseUri("http://hoge.com/aaa/bbb.html", null); + assertEquals("http://hoge.com/aaa/bbb.html", value.toString()); - value = transformer.getBaseUrl("http://hoge.com/", "://hoge.com/aaa/"); - assertEquals("http://hoge.com/aaa/", value.toExternalForm()); + value = transformer.getBaseUri("http://hoge.com/", "://hoge.com/aaa/"); + assertEquals("http://hoge.com/aaa/", value.toString()); - value = transformer.getBaseUrl("https://hoge.com/", "://hoge.com/aaa/"); - assertEquals("https://hoge.com/aaa/", value.toExternalForm()); + value = transformer.getBaseUri("https://hoge.com/", "://hoge.com/aaa/"); + assertEquals("https://hoge.com/aaa/", value.toString()); - value = transformer.getBaseUrl("http://hoge.com/", "//hoge.com/aaa/"); - assertEquals("http://hoge.com/aaa/", value.toExternalForm()); + value = transformer.getBaseUri("http://hoge.com/", "//hoge.com/aaa/"); + assertEquals("http://hoge.com/aaa/", value.toString()); - value = transformer.getBaseUrl("https://hoge.com/", "//hoge.com/aaa/"); - assertEquals("https://hoge.com/aaa/", value.toExternalForm()); + value = transformer.getBaseUri("https://hoge.com/", "//hoge.com/aaa/"); + assertEquals("https://hoge.com/aaa/", value.toString()); - value = transformer.getBaseUrl("https://hoge.com/", "aaa/"); - assertEquals("https://hoge.com/aaa/", value.toExternalForm()); + value = transformer.getBaseUri("https://hoge.com/", "aaa/"); + assertEquals("https://hoge.com/aaa/", value.toString()); } public void test_getThumbnailUrl_no() throws Exception {