From b082bcf5f51a8014ce8868bacb15ea85171e104e Mon Sep 17 00:00:00 2001 From: JH Park Date: Tue, 19 Nov 2024 23:53:33 +0900 Subject: [PATCH] =?UTF-8?q?Refactor:=20=EC=9B=B9=20=ED=81=AC=EB=A1=A4?= =?UTF-8?q?=EB=A7=81=20=EB=A1=9C=EC=BB=AC=20=ED=99=98=EA=B2=BD=EC=97=90?= =?UTF-8?q?=EC=84=9C=20=EC=A0=95=EC=83=81=20=EC=9E=91=EB=8F=99=ED=95=98?= =?UTF-8?q?=EB=8A=94=20ver=20=EB=B3=B5=EA=B5=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../SeleniumService/WebCrawlerService.java | 148 +++++++++++++----- 1 file changed, 106 insertions(+), 42 deletions(-) diff --git a/src/main/java/Capstone/AutoScheduler/global/service/SeleniumService/WebCrawlerService.java b/src/main/java/Capstone/AutoScheduler/global/service/SeleniumService/WebCrawlerService.java index 3891dbc..a1e74bd 100644 --- a/src/main/java/Capstone/AutoScheduler/global/service/SeleniumService/WebCrawlerService.java +++ b/src/main/java/Capstone/AutoScheduler/global/service/SeleniumService/WebCrawlerService.java @@ -1,3 +1,67 @@ +package Capstone.AutoScheduler.global.service.SeleniumService; + +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; +import org.openqa.selenium.support.ui.ExpectedConditions; +import org.openqa.selenium.support.ui.WebDriverWait; +import org.springframework.stereotype.Service; + +import java.net.MalformedURLException; +import java.net.URL; +import java.time.Duration; + +@Service +public class WebCrawlerService { + + public String getHtmlContent(String url) { + WebDriver driver = null; + + try { + validateUrl(url); + + // ChromeDriver 설정 + ChromeOptions options = new ChromeOptions(); + options.addArguments("--headless"); // GUI 없이 실행 + options.addArguments("--no-sandbox"); // 보안 설정 + options.addArguments("--disable-dev-shm-usage"); // 메모리 문제 방지 + options.addArguments("--disable-gpu"); //추가한 옵션 + options.addArguments("--ignore-ssl-errors=yes"); + options.addArguments("--ignore-certificate-errors"); + + driver = new ChromeDriver(options); + + // URL 로드 + driver.get(url); + + // 페이지 로드 대기 + WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(10)); + wait.until(ExpectedConditions.presenceOfElementLocated(org.openqa.selenium.By.tagName("body"))); + + // HTML 소스 반환 + return driver.getPageSource(); + } catch (MalformedURLException e) { + throw new IllegalArgumentException("유효하지 않은 URL 형식입니다: " + url); + } catch (Exception e) { + throw new RuntimeException("크롤링 중 오류가 발생했습니다: " + e.getMessage()); + } finally { + if (driver != null) { + driver.quit(); // WebDriver 종료 + } + } + } + + private void validateUrl(String url) throws MalformedURLException { + URL parsedUrl = new URL(url); + + if (!"https".equalsIgnoreCase(parsedUrl.getProtocol())) { + throw new IllegalArgumentException("HTTPS 프로토콜만 지원합니다."); + } + } +} + + + //package Capstone.AutoScheduler.global.service.SeleniumService; // //import org.openqa.selenium.WebDriver; @@ -56,45 +120,45 @@ //} -package Capstone.AutoScheduler.global.service.SeleniumService; - -import org.openqa.selenium.WebDriver; -import org.openqa.selenium.chrome.ChromeOptions; -import org.openqa.selenium.remote.DesiredCapabilities; -import org.openqa.selenium.remote.RemoteWebDriver; -import org.springframework.stereotype.Service; - -import java.net.MalformedURLException; -import java.net.URL; - -@Service -public class WebCrawlerService { - - public String getHtmlContent(String url) throws MalformedURLException { - // Selenium Grid의 URL 설정 (EC2 퍼블릭 IP를 사용) - String seleniumGridUrl = "http://3.35.252.162:4444/wd/hub"; - - // ChromeOptions 설정 - ChromeOptions options = new ChromeOptions(); - options.addArguments("--headless"); // 헤드리스 모드로 실행 (브라우저 창이 뜨지 않음) - options.addArguments("--no-sandbox"); - options.addArguments("--disable-dev-shm-usage"); - - // WebDriver 설정 (Selenium Grid에 연결) - //WebDriver driver = new RemoteWebDriver(new URL(seleniumGridUrl), new ChromeOptions()); - WebDriver driver = new RemoteWebDriver(new URL("http://selenium-chrome:4444/wd/hub"), options); - - try { - // 지정된 URL로 이동 - driver.get(url); - - // 페이지의 HTML 소스 가져오기 - String pageSource = driver.getPageSource(); - - return pageSource; - } finally { - // WebDriver 종료 - driver.quit(); - } - } -} +//package Capstone.AutoScheduler.global.service.SeleniumService; +// +//import org.openqa.selenium.WebDriver; +//import org.openqa.selenium.chrome.ChromeOptions; +//import org.openqa.selenium.remote.DesiredCapabilities; +//import org.openqa.selenium.remote.RemoteWebDriver; +//import org.springframework.stereotype.Service; +// +//import java.net.MalformedURLException; +//import java.net.URL; +// +//@Service +//public class WebCrawlerService { +// +// public String getHtmlContent(String url) throws MalformedURLException { +// // Selenium Grid의 URL 설정 (EC2 퍼블릭 IP를 사용) +// String seleniumGridUrl = "http://3.35.252.162:4444/wd/hub"; +// +// // ChromeOptions 설정 +// ChromeOptions options = new ChromeOptions(); +// options.addArguments("--headless"); // 헤드리스 모드로 실행 (브라우저 창이 뜨지 않음) +// options.addArguments("--no-sandbox"); +// options.addArguments("--disable-dev-shm-usage"); +// +// // WebDriver 설정 (Selenium Grid에 연결) +// //WebDriver driver = new RemoteWebDriver(new URL(seleniumGridUrl), new ChromeOptions()); +// WebDriver driver = new RemoteWebDriver(new URL("http://selenium-chrome:4444/wd/hub"), options); +// +// try { +// // 지정된 URL로 이동 +// driver.get(url); +// +// // 페이지의 HTML 소스 가져오기 +// String pageSource = driver.getPageSource(); +// +// return pageSource; +// } finally { +// // WebDriver 종료 +// driver.quit(); +// } +// } +//}