-
-
Notifications
You must be signed in to change notification settings - Fork 2.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add LWTA abbreviation support #12109
base: main
Are you sure you want to change the base?
Changes from 8 commits
40ae128
1246973
463d8f4
d412e9b
fbb5b6b
809aa4d
4aa2fa7
42132bb
2a8a182
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,8 @@ public class AbbreviationParser { | |
// Ensures ordering while preventing duplicates | ||
private final LinkedHashSet<Abbreviation> abbreviations = new LinkedHashSet<>(); | ||
|
||
private final LinkedHashSet<LwtaAbbreviation> lwtaAbbreviations = new LinkedHashSet<>(); | ||
|
||
/* | ||
* Read the given file, which should contain a list of journal names and their abbreviations. Each line should be | ||
* formatted as: "Full Journal Name,Abbr. Journal Name[,Shortest Unique Abbreviation]" | ||
|
@@ -49,6 +51,51 @@ void readJournalListFromFile(Path file) throws IOException { | |
} | ||
} | ||
|
||
void readLwtaAbbreviations(Path file) throws IOException { | ||
char delimiter = detectDelimiter(file); | ||
|
||
try (CSVParser csvParser = new CSVParser(Files.newBufferedReader(file, StandardCharsets.UTF_8), AbbreviationFormat.getCSVFormatWithDelimiter(delimiter))) { | ||
for (CSVRecord csvRecord : csvParser) { | ||
String name = csvRecord.size() > 0 ? csvRecord.get(0) : ""; | ||
String abbreviation = csvRecord.size() > 1 ? csvRecord.get(1) : ""; | ||
|
||
// Check name and abbreviation | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove comment - this is clear from the statement. |
||
if (name.isEmpty() || abbreviation.isEmpty()) { | ||
return; | ||
} | ||
LwtaAbbreviation.Position position; | ||
|
||
if (name.endsWith("-") && name.startsWith("-")) { | ||
position = LwtaAbbreviation.Position.IN_WORD; | ||
} else if (name.endsWith("-")) { | ||
position = LwtaAbbreviation.Position.STARTS_WORD; | ||
} else if (name.startsWith("-")) { | ||
position = LwtaAbbreviation.Position.ENDS_WORD; | ||
} else { | ||
position = LwtaAbbreviation.Position.FULL_WORD; | ||
} | ||
|
||
boolean allowsPrefix = false; | ||
boolean allowsSuffix = false; | ||
|
||
if (abbreviation.startsWith("-")) { | ||
allowsPrefix = true; | ||
} | ||
|
||
if (abbreviation.endsWith("-")) { | ||
allowsSuffix = true; | ||
} | ||
|
||
if ("n.a.".equals(abbreviation)) { | ||
abbreviation = name; | ||
} | ||
|
||
LwtaAbbreviation abbreviationToAdd = new LwtaAbbreviation(removeHyphens(name), removeHyphens(abbreviation), position, allowsPrefix, allowsSuffix); | ||
lwtaAbbreviations.add(abbreviationToAdd); | ||
} | ||
} | ||
} | ||
|
||
private char detectDelimiter(Path file) throws IOException { | ||
try (BufferedReader reader = Files.newBufferedReader(file, StandardCharsets.UTF_8)) { | ||
String line = reader.readLine(); | ||
|
@@ -63,7 +110,21 @@ private char detectDelimiter(Path file) throws IOException { | |
} | ||
} | ||
|
||
private static String removeHyphens(String string) { | ||
if (string.startsWith("-")) { | ||
string = string.substring(1); | ||
} | ||
if (string.endsWith("-")) { | ||
string = string.substring(0, string.length() - 1); | ||
} | ||
Comment on lines
+117
to
+119
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just use
(replace by normal import - i just want tos show you the package) |
||
return string; | ||
} | ||
|
||
/**
 * Returns the journal abbreviations held by this parser.
 * The parser's own insertion-ordered set is returned directly, not a copy.
 */
public Collection<Abbreviation> getAbbreviations() { | ||
return abbreviations; | ||
} | ||
|
||
/**
 * Returns the LWTA abbreviation entries held by this parser.
 * The parser's own insertion-ordered set is returned directly, not a copy.
 */
public Collection<LwtaAbbreviation> getLwtaAbbreviations() { | ||
return lwtaAbbreviations; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
package org.jabref.logic.journals; | ||
|
||
public class LwtaAbbreviation { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Refactor to |
||
private final String unAbbreviated; | ||
private final String abbreviation; | ||
private final Position position; | ||
private final boolean allowsSuffix; | ||
private final boolean allowsPrefix; | ||
|
||
enum Position { | ||
ENDS_WORD, STARTS_WORD, IN_WORD, FULL_WORD | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sort more logically: full, starts, in, end |
||
} | ||
|
||
LwtaAbbreviation(String unAbbreviated, String abbreviation, Position position, boolean allowsPrefix, boolean allowsSuffix) { | ||
this.unAbbreviated = unAbbreviated; | ||
this.abbreviation = abbreviation; | ||
this.position = position; | ||
this.allowsPrefix = allowsPrefix; | ||
this.allowsSuffix = allowsSuffix; | ||
} | ||
|
||
public String getAbbreviation() { | ||
return abbreviation; | ||
} | ||
|
||
public String getUnAbbreviated() { | ||
return unAbbreviated; | ||
} | ||
|
||
public Position getPosition() { | ||
return position; | ||
} | ||
|
||
public boolean getAllowsPrefix() { | ||
return allowsPrefix; | ||
} | ||
|
||
public boolean getAllowsSuffix() { | ||
return allowsSuffix; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,209 @@ | ||
package org.jabref.logic.journals; | ||
|
||
import java.io.IOException; | ||
import java.nio.file.Path; | ||
import java.util.ArrayList; | ||
import java.util.Collection; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Set; | ||
|
||
public class LwtaAbbreviationRepository { | ||
|
||
private final Map<String, LwtaAbbreviation> lwtaToAbbreviationObject; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove |
||
|
||
// incomplete list | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What does "incomplete" mean for this list? What should one do to add words? Either state that or remove the comment. |
||
private final String[] WORDS_TO_REMOVE = new String[]{"the", "and", "&", "of", "but", "sans", "section", "series", "part"}; | ||
|
||
/** | ||
* instantiates this class with a csv file | ||
*/ | ||
Comment on lines
+20
to
+22
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove comment - or replace it by an explenation of the |
||
public LwtaAbbreviationRepository(Path file) throws IOException { | ||
AbbreviationParser parser = new AbbreviationParser(); | ||
lwtaToAbbreviationObject = new HashMap<>(); | ||
|
||
parser.readLwtaAbbreviations(file); | ||
Collection<LwtaAbbreviation> abbreviations = parser.getLwtaAbbreviations(); | ||
|
||
for (LwtaAbbreviation abbreviation : abbreviations) { | ||
lwtaToAbbreviationObject.put(abbreviation.getUnAbbreviated(), abbreviation); | ||
} | ||
} | ||
|
||
/** | ||
* returns true if the abbreviation can be applied to the word given | ||
*/ | ||
private boolean canAbbreviate(String word, LwtaAbbreviation lwtaAbbreviation) { | ||
return switch (lwtaAbbreviation.getPosition()) { | ||
case IN_WORD -> | ||
word.contains(lwtaAbbreviation.getUnAbbreviated()); | ||
case STARTS_WORD -> | ||
word.startsWith(lwtaAbbreviation.getUnAbbreviated()); | ||
case ENDS_WORD -> | ||
word.endsWith(lwtaAbbreviation.getUnAbbreviated()); | ||
default -> | ||
false; | ||
}; | ||
} | ||
|
||
/** | ||
* abbreviates the word with the set of abbreviations. Recursive, prefers abbreviating the longest chunks first. | ||
*/ | ||
private String abbreviateWord(String word, Set<String> abbreviations) { | ||
List<String> possibleAbbreviations = new ArrayList<>(); | ||
|
||
// We need to keep capitalisation for our final result, but lower case allows us to compare | ||
boolean capitalised = Character.isUpperCase(word.charAt(0)); | ||
String wordLowerCase = word.toLowerCase(); | ||
|
||
for (String abbreviation : abbreviations) { | ||
LwtaAbbreviation lwtaAbbreviation = lwtaToAbbreviationObject.get(abbreviation); | ||
|
||
if (canAbbreviate(wordLowerCase, lwtaAbbreviation)) { | ||
possibleAbbreviations.add(abbreviation); | ||
} | ||
} | ||
|
||
if (possibleAbbreviations.isEmpty()) { | ||
return word; | ||
} | ||
|
||
// Now we have to decide conflicts -- for example, maybe both "balti-" and "baltimore" are matched. We'll go by the longer abbreviation first | ||
possibleAbbreviations.sort((String string1, String string2) -> string2.length() - string1.length()); | ||
LwtaAbbreviation abbreviationUsed = lwtaToAbbreviationObject.get(possibleAbbreviations.getFirst()); | ||
|
||
Set<String> possibleAbbSet = new HashSet<>(possibleAbbreviations); | ||
String wordAbb = ""; | ||
|
||
switch (abbreviationUsed.getPosition()) { | ||
case ENDS_WORD -> { | ||
for (int i = 0; i < word.length(); i++) { | ||
String head = word.substring(0, i); | ||
String tail = word.substring(i); | ||
if (tail.equalsIgnoreCase(abbreviationUsed.getUnAbbreviated())) { | ||
String prefix = ""; | ||
if (abbreviationUsed.getAllowsPrefix()) { | ||
prefix = abbreviateWord(head, possibleAbbSet); | ||
} | ||
|
||
wordAbb = prefix + abbreviationUsed.getAbbreviation(); | ||
} | ||
} | ||
} | ||
case STARTS_WORD -> { | ||
for (int i = 0; i < word.length(); i++) { | ||
String head = word.substring(0, i); | ||
String tail = word.substring(i); | ||
if (head.equalsIgnoreCase(abbreviationUsed.getUnAbbreviated())) { | ||
String suffix = ""; | ||
if (abbreviationUsed.getAllowsSuffix()) { | ||
suffix = abbreviateWord(tail, possibleAbbSet); | ||
} | ||
|
||
wordAbb = abbreviationUsed.getAbbreviation() + suffix; | ||
} | ||
} | ||
} | ||
case IN_WORD -> { | ||
String[] unAbbreviatedPieces = word.split(abbreviationUsed.getAbbreviation(), 2); | ||
if (unAbbreviatedPieces.length == 0) { | ||
return abbreviationUsed.getAbbreviation(); | ||
} | ||
String head = unAbbreviatedPieces[0]; | ||
String tail = unAbbreviatedPieces[1]; | ||
String prefix = ""; | ||
if (abbreviationUsed.getAllowsPrefix()) { | ||
prefix = abbreviateWord(head, possibleAbbSet); | ||
} | ||
String suffix = ""; | ||
if (abbreviationUsed.getAllowsSuffix()) { | ||
suffix = abbreviateWord(tail, possibleAbbSet); | ||
} | ||
wordAbb = prefix + abbreviationUsed.getAbbreviation() + suffix; | ||
} | ||
default -> | ||
wordAbb = word; | ||
} | ||
|
||
// Now capitalise the abbreviation correctly: | ||
if (capitalised && wordAbb.length() > 0) { | ||
wordAbb = wordAbb.substring(0, 1).toUpperCase() + wordAbb.substring(1); | ||
} | ||
|
||
return wordAbb; | ||
} | ||
|
||
/** | ||
* turns a journal name into its lwta abbreviation | ||
*/ | ||
String abbreviateJournalName(String name) { | ||
// Remove commas and replace full stops with commas | ||
name = name.replace(",", ""); | ||
name = name.replace(".", ","); | ||
|
||
// Split into words: | ||
String[] words = name.split(" "); | ||
ArrayList<String> wordsToBeAbbreviated = new ArrayList<>(); | ||
ArrayList<String> abbreviatedWords = new ArrayList<>(); | ||
|
||
// Remove articles/prepositions | ||
for (String word : words) { | ||
boolean removeWord = false; | ||
|
||
for (String wordToRemove : WORDS_TO_REMOVE) { | ||
if (word.equalsIgnoreCase(wordToRemove)) { | ||
removeWord = true; | ||
break; | ||
} | ||
} | ||
|
||
if (!removeWord) { | ||
wordsToBeAbbreviated.add(word); | ||
} | ||
} | ||
|
||
// Single word titles should not be abbreviated: | ||
if (wordsToBeAbbreviated.size() == 1) { | ||
return wordsToBeAbbreviated.getFirst(); | ||
} | ||
|
||
// Abbreviate each word: | ||
for (String word : wordsToBeAbbreviated) { | ||
String abbreviated = word; | ||
boolean abbreviatedAlready = false; | ||
for (String unAbbreviated : lwtaToAbbreviationObject.keySet()) { | ||
// If the word is just punctuation and an abbreviation, just use that: | ||
String lowerWord = word.toLowerCase(); | ||
if (lowerWord.contains(unAbbreviated.toLowerCase()) && lowerWord.replaceAll("[^\\sa-zA-Z0-9]", "").equalsIgnoreCase(unAbbreviated)) { | ||
abbreviated = lwtaToAbbreviationObject.get(unAbbreviated).getAbbreviation(); | ||
// Fix capitalisation: | ||
if (Character.isUpperCase(word.charAt(0))) { | ||
abbreviated = abbreviated.substring(0, 1).toUpperCase() + abbreviated.substring(1); | ||
} | ||
|
||
abbreviatedAlready = true; | ||
break; | ||
} | ||
} | ||
|
||
if (!abbreviatedAlready) { | ||
abbreviated = abbreviateWord(word, lwtaToAbbreviationObject.keySet()); | ||
} | ||
|
||
abbreviatedWords.add(abbreviated); | ||
} | ||
|
||
// put the abbreviated words back together | ||
StringBuilder sb = new StringBuilder(); | ||
for (int i = 0; i < abbreviatedWords.size() - 1; i++) { | ||
sb.append(abbreviatedWords.get(i)); | ||
sb.append(" "); | ||
} | ||
sb.append(abbreviatedWords.getLast()); | ||
|
||
return sb.toString(); | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Subclass the AbbreviationParser, because LWTA is a separate functionality (too little coupling with the other methods). Someone will say that composition should be preferred over inheritance - I am not sure which methods of this class you really need.