-
-
Notifications
You must be signed in to change notification settings - Fork 2.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add a title guess method to get "better" title #12018
Merged
Merged
Changes from 5 commits
Commits
Show all changes
24 commits
Select commit
Hold shift + click to select a range
7e86e9e
Add title guess method
leaf-soba 2993315
fix unit test
leaf-soba f25f43b
update unit test to JDK 21 style
leaf-soba ce3619f
update unit test
leaf-soba 9acd52d
update get title by area
leaf-soba f7a3f84
remove StringUtils.isBlank and add @AllowedToUseAwt
leaf-soba d88418e
add unit test
leaf-soba 16d94c7
change to get title by font size
leaf-soba 3cbaec2
Merge branch 'main' into close-issue-11999
leaf-soba da8472b
RemoveTestPrefix
leaf-soba f531f4e
Merge branch 'close-issue-11999' of https://github.com/leaf-soba/jabr…
leaf-soba 3a7018e
temp fix the unit test
leaf-soba 7edb9b7
fix the unit test and open rewrite issue
leaf-soba b386f85
remove commented code
leaf-soba c2b1fc4
Add 5 more unittest case
leaf-soba 7773706
resolve all comments so far
leaf-soba 65965dd
remove Blank line at start of block
leaf-soba 68eb97c
rename and replace unit test file
leaf-soba a530d40
add bib and readme.md
leaf-soba e782799
Merge branch 'main' into close-issue-11999
koppor c8e8d5f
Update CHANGELOG.md
koppor a056481
rename the file to pass CI
leaf-soba b11717a
address all comments
leaf-soba ae22b25
fix the file name in unit test
leaf-soba File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
package org.jabref.logic.importer.fileformat; | ||
|
||
import java.awt.Rectangle; | ||
import java.io.BufferedReader; | ||
import java.io.IOException; | ||
import java.io.StringWriter; | ||
|
@@ -27,8 +28,11 @@ | |
import org.jabref.model.strings.StringUtil; | ||
|
||
import com.google.common.base.Strings; | ||
import org.apache.commons.lang3.StringUtils; | ||
import org.apache.pdfbox.pdmodel.PDDocument; | ||
import org.apache.pdfbox.pdmodel.PDPage; | ||
import org.apache.pdfbox.text.PDFTextStripper; | ||
import org.apache.pdfbox.text.PDFTextStripperByArea; | ||
|
||
/** | ||
* PdfContentImporter parses data of the first page of the PDF and creates a BibTeX entry. | ||
|
@@ -196,7 +200,8 @@ public ParserResult importDatabase(Path filePath) { | |
List<BibEntry> result = new ArrayList<>(1); | ||
try (PDDocument document = new XmpUtilReader().loadWithAutomaticDecryption(filePath)) { | ||
String firstPageContents = getFirstPageContents(document); | ||
Optional<BibEntry> entry = getEntryFromPDFContent(firstPageContents, OS.NEWLINE); | ||
String title = getTitleByArea(document); | ||
Optional<BibEntry> entry = getEntryFromPDFContent(firstPageContents, OS.NEWLINE, title); | ||
entry.ifPresent(result::add); | ||
} catch (EncryptedPdfsNotSupportedException e) { | ||
return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported.")); | ||
|
@@ -208,8 +213,72 @@ public ParserResult importDatabase(Path filePath) { | |
return new ParserResult(result); | ||
} | ||
|
||
// private String guessBetterTitleInMetaData(List<String> metadata) { | ||
// String probableTitle = null; | ||
// int maxScore = 0; | ||
// | ||
// for (String str : metadata) { | ||
// if (str == null) { | ||
// continue; | ||
// } | ||
// // Rule 1: Check for file type paths, ignore them | ||
// if (str.contains(".pdf") || str.contains(".docx") || str.contains(".doc")) { | ||
// continue; | ||
// } | ||
// // Rule 2: Abstract detection (too long for a title) | ||
// if (str.length() > 300) { | ||
// continue; | ||
// } | ||
// // Rule 3: Title length and academic keywords (heuristic) | ||
// int score = 0; | ||
// score += str.length(); // Titles tend to be longer | ||
// score += countAcademicKeywords(str); // Bonus for academic terms | ||
// | ||
// if (score > maxScore) { | ||
// maxScore = score; | ||
// probableTitle = str; | ||
// } | ||
// } | ||
// | ||
// return probableTitle; | ||
// } | ||
|
||
// Count common academic keywords | ||
// private int countAcademicKeywords(String str) { | ||
// List<String> keywords = Arrays.asList("study", "exploring", "research", "development", "design", "learning"); | ||
// int count = 0; | ||
// for (String keyword : keywords) { | ||
// if (str.toLowerCase().contains(keyword)) { | ||
// count++; | ||
// } | ||
// } | ||
// return count; | ||
// } | ||
|
||
// private List<String> buildMetaData( | ||
// String author, String editor, String abstractT, String keywords, String title, | ||
// String conference, String doi, String series, String volume, String number, | ||
// String pages, String year, String publisher) { | ||
// List<String> metadataList = new ArrayList<>(); | ||
// metadataList.add(author); | ||
// metadataList.add(editor); | ||
// metadataList.add(abstractT); | ||
// metadataList.add(keywords); | ||
// metadataList.add(title); | ||
// metadataList.add(conference); | ||
// metadataList.add(doi); | ||
// metadataList.add(series); | ||
// metadataList.add(volume); | ||
// metadataList.add(number); | ||
// metadataList.add(pages); | ||
// metadataList.add(year); | ||
// metadataList.add(publisher); | ||
// | ||
// return metadataList; | ||
// } | ||
|
||
// make this method package visible so we can test it | ||
Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineSeparator) { | ||
Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineSeparator, String titleByPosition) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. After thinking, allowing |
||
// idea: split[] contains the different lines | ||
// blocks are separated by empty lines | ||
// treat each block | ||
|
@@ -437,7 +506,8 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS | |
entry.setField(StandardField.KEYWORDS, keywords); | ||
} | ||
if (title != null) { | ||
entry.setField(StandardField.TITLE, title); | ||
// title = guessBetterTitleInMetaData(buildMetaData(author, editor, abstractT, keywords, title, conference, doi, series, volume, number, pages, year, publisher)); | ||
entry.setField(StandardField.TITLE, (StringUtils.isBlank(titleByPosition)) ? title : titleByPosition); | ||
} | ||
if (conference != null) { | ||
entry.setField(StandardField.BOOKTITLE, conference); | ||
|
@@ -493,6 +563,16 @@ private String getFirstPageContents(PDDocument document) throws IOException { | |
return writer.toString(); | ||
} | ||
|
||
// Extracts the text found inside a fixed rectangular region of the first page, | ||
// which the caller passes on as a title candidate. | ||
// | ||
// document: the opened PDF; only page index 0 is read. | ||
// Returns the region's text with leading/trailing whitespace trimmed. | ||
// Declares IOException (presumably raised by the PDFBox stripper calls - confirm). | ||
// | ||
// NOTE(review): there is no page-count check before getPage(0); behaviour for a | ||
// PDF without pages is not handled here - confirm what the caller expects. | ||
private String getTitleByArea(PDDocument document) throws IOException { | ||
PDPage firstPage = document.getPage(0); | ||
PDFTextStripperByArea stripper = new PDFTextStripperByArea(); | ||
// Ask the stripper to order the extracted text by its position on the page. | ||
stripper.setSortByPosition(true); | ||
// java.awt.Rectangle(x, y, width, height): a 500x100 box whose origin is (50, 50). | ||
// NOTE(review): the coordinates are hard-coded; presumably they are page units | ||
// measured from the top-left corner - verify against the PDFBox documentation. | ||
Rectangle titleArea = new Rectangle(50, 50, 500, 100); | ||
// The region name "title" is the key used again below to fetch the text. | ||
stripper.addRegion("title", titleArea); | ||
stripper.extractRegions(firstPage); | ||
return stripper.getTextForRegion("title").trim(); | ||
} | ||
|
||
/** | ||
* Extract the year out of curString (if it is not yet defined) | ||
*/ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Change comment to
@VisibleForTesting