JabRef · koppor · Oct 30, 2024 · Oct 18, 2024 · Oct 18, 2024 · Oct 18, 2024
diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java
@@ -1,5 +1,6 @@
 package org.jabref.logic.importer.fileformat;
 
+import java.awt.Rectangle;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.StringWriter;
@@ -27,8 +28,11 @@
 import org.jabref.model.strings.StringUtil;
 
 import com.google.common.base.Strings;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.text.PDFTextStripperByArea;
 
 /**
  * PdfContentImporter parses data of the first page of the PDF and creates a BibTeX entry.
@@ -196,7 +200,8 @@ public ParserResult importDatabase(Path filePath) {
         List<BibEntry> result = new ArrayList<>(1);
         try (PDDocument document = new XmpUtilReader().loadWithAutomaticDecryption(filePath)) {
             String firstPageContents = getFirstPageContents(document);
-            Optional<BibEntry> entry = getEntryFromPDFContent(firstPageContents, OS.NEWLINE);
+            String title = getTitleByArea(document);
+            Optional<BibEntry> entry = getEntryFromPDFContent(firstPageContents, OS.NEWLINE, title);
             entry.ifPresent(result::add);
         } catch (EncryptedPdfsNotSupportedException e) {
             return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported."));
@@ -208,8 +213,72 @@ public ParserResult importDatabase(Path filePath) {
         return new ParserResult(result);
     }
 
+//    private String guessBetterTitleInMetaData(List<String> metadata) {
+//        String probableTitle = null;
+//        int maxScore = 0;
+//
+//        for (String str : metadata) {
+//            if (str == null) {
+//                continue;
+//            }
+//            // Rule 1: Check for file type paths, ignore them
+//            if (str.contains(".pdf") || str.contains(".docx") || str.contains(".doc")) {
+//                continue;
+//            }
+//            // Rule 2: Abstract detection (too long for a title)
+//            if (str.length() > 300) {
+//                continue;
+//            }
+//            // Rule 3: Title length and academic keywords (heuristic)
+//            int score = 0;
+//            score += str.length(); // Titles tend to be longer
+//            score += countAcademicKeywords(str); // Bonus for academic terms
+//
+//            if (score > maxScore) {
+//                maxScore = score;
+//                probableTitle = str;
+//            }
+//        }
+//
+//        return probableTitle;
+//    }
+
+    // Count common academic keywords
+//    private int countAcademicKeywords(String str) {
+//        List<String> keywords = Arrays.asList("study", "exploring", "research", "development", "design", "learning");
+//        int count = 0;
+//        for (String keyword : keywords) {
+//            if (str.toLowerCase().contains(keyword)) {
+//                count++;
+//            }
+//        }
+//        return count;
+//    }
+
+//    private List<String> buildMetaData(
+//            String author, String editor, String abstractT, String keywords, String title,
+//            String conference, String doi, String series, String volume, String number,
+//            String pages, String year, String publisher) {
+//        List<String> metadataList = new ArrayList<>();
+//        metadataList.add(author);
+//        metadataList.add(editor);
+//        metadataList.add(abstractT);
+//        metadataList.add(keywords);
+//        metadataList.add(title);
+//        metadataList.add(conference);
+//        metadataList.add(doi);
+//        metadataList.add(series);
+//        metadataList.add(volume);
+//        metadataList.add(number);
+//        metadataList.add(pages);
+//        metadataList.add(year);
+//        metadataList.add(publisher);
+//
+//        return metadataList;
+//    }
+
     // make this method package visible so we can test it
-    Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineSeparator) {
+    Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineSeparator, String titleByPosition) {
         // idea: split[] contains the different lines
         // blocks are separated by empty lines
         // treat each block
@@ -437,7 +506,8 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
             entry.setField(StandardField.KEYWORDS, keywords);
         }
         if (title != null) {
-            entry.setField(StandardField.TITLE, title);
+//            title = guessBetterTitleInMetaData(buildMetaData(author, editor, abstractT, keywords, title, conference, doi, series, volume, number, pages, year, publisher));
+            entry.setField(StandardField.TITLE, (StringUtils.isBlank(titleByPosition)) ? title : titleByPosition);
         }
         if (conference != null) {
             entry.setField(StandardField.BOOKTITLE, conference);
@@ -493,6 +563,30 @@ private String getFirstPageContents(PDDocument document) throws IOException {
         return writer.toString();
     }
 
+    /**
+     * Extracts the text inside a fixed rectangle of the first page; the caller uses it as a
+     * position-based candidate for the title.
+     *
+     * @param document the PDF document to read from
+     * @return the trimmed text of the region, or an empty string if the document has no pages
+     * @throws IOException if the region text cannot be extracted
+     */
+    private String getTitleByArea(PDDocument document) throws IOException {
+        if (document.getNumberOfPages() == 0) {
+            // getPage(0) would throw for a page-less document; a blank result makes
+            // getEntryFromPDFContent fall back to the title parsed from the text
+            return "";
+        }
+        PDPage firstPage = document.getPage(0);
+        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
+        stripper.setSortByPosition(true);
+        // Rectangle(x, y, width, height): the region of the page to extract text from
+        Rectangle titleArea = new Rectangle(50, 50, 500, 100);
+        stripper.addRegion("title", titleArea);
+        stripper.extractRegions(firstPage);
+        return stripper.getTextForRegion("title").trim();
+    }
+
     /**
      * Extract the year out of curString (if it is not yet defined)
      */

diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java
@@ -2,6 +2,7 @@
 
 import java.nio.file.Path;
 import java.util.List;
+import java.util.Objects;
 import java.util.Optional;
 
 import org.jabref.model.entry.BibEntry;
@@ -15,7 +16,7 @@
 
 class PdfContentImporterTest {
 
-    private PdfContentImporter importer = new PdfContentImporter();
+    private final PdfContentImporter importer = new PdfContentImporter();
 
     @Test
     void doesNotHandleEncryptedPdfs() throws Exception {
@@ -65,7 +66,7 @@ void parsingEditorWithoutPagesorSeriesInformation() {
                 Corpus linguistics investigates human language by starting out from large
                 """;
 
-        assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContents, "\n"));
+        assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContents, "\n", ""));
     }
 
     @Test
@@ -88,7 +89,7 @@ Smith, Lucy Anna (2014) Mortality in the Ornamental Fish Retail Sector: an Analy
                 UNSPECIFIED
                 Master of Research (MRes) thesis, University of Kent,.""";
 
-        assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContents, "\n"));
+        assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContents, "\n", ""));
     }
 
     @Test
@@ -121,6 +122,14 @@ British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296
                 British Journal of Nutrition
                 https://doi.org/10.1017/S0007114507795296 Published online by Cambridge University Press""";
 
-        assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n"));
+        assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n", ""));
+    }
+
+    @Test
+    void se2Pdf() throws Exception {
+        // expected title as returned by importDatabase for the bundled PDF; note the embedded line break
+        String expectedTitle = "On How We Can Teach – Exploring New Ways in\nProfessional Software Development for Students";
+        Path pdf = Path.of(Objects.requireNonNull(PdfContentImporter.class.getResource("/pdfs/se2paper.pdf")).toURI());
+        assertEquals(Optional.of(expectedTitle), importer.importDatabase(pdf).getDatabase().getEntries().getFirst().getTitle());
     }
 }
diff --git a/src/test/resources/pdfs/se2paper.pdf b/src/test/resources/pdfs/se2paper.pdf