Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add LWTA abbreviation support #12109

Draft
wants to merge 9 commits into
base: main
Choose a base branch
from
61 changes: 61 additions & 0 deletions src/main/java/org/jabref/logic/journals/AbbreviationParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ public class AbbreviationParser {
// Ensures ordering while preventing duplicates
private final LinkedHashSet<Abbreviation> abbreviations = new LinkedHashSet<>();

// LWTA entries added by readLwtaAbbreviations, kept in the order they were read.
// NOTE(review): duplicate prevention in a LinkedHashSet relies on LwtaAbbreviation's equals/hashCode - confirm they are defined.
private final LinkedHashSet<LwtaAbbreviation> lwtaAbbreviations = new LinkedHashSet<>();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Subclass the AbbreviationParser, because LWTA is separate functionality (too little coupling with the other methods). Someone will say that composition should be preferred over inheritance - I am not sure which methods of this class you really need.


/*
* Read the given file, which should contain a list of journal names and their abbreviations. Each line should be
* formatted as: "Full Journal Name,Abbr. Journal Name[,Shortest Unique Abbreviation]"
Expand Down Expand Up @@ -49,6 +51,51 @@ void readJournalListFromFile(Path file) throws IOException {
}
}

void readLwtaAbbreviations(Path file) throws IOException {
char delimiter = detectDelimiter(file);

try (CSVParser csvParser = new CSVParser(Files.newBufferedReader(file, StandardCharsets.UTF_8), AbbreviationFormat.getCSVFormatWithDelimiter(delimiter))) {
for (CSVRecord csvRecord : csvParser) {
String name = csvRecord.size() > 0 ? csvRecord.get(0) : "";
String abbreviation = csvRecord.size() > 1 ? csvRecord.get(1) : "";

// Check name and abbreviation
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove comment - this is clear from the statement.

if (name.isEmpty() || abbreviation.isEmpty()) {
return;
}
LwtaAbbreviation.Position position;

if (name.endsWith("-") && name.startsWith("-")) {
position = LwtaAbbreviation.Position.IN_WORD;
} else if (name.endsWith("-")) {
position = LwtaAbbreviation.Position.STARTS_WORD;
} else if (name.startsWith("-")) {
position = LwtaAbbreviation.Position.ENDS_WORD;
} else {
position = LwtaAbbreviation.Position.FULL_WORD;
}

boolean allowsPrefix = false;
boolean allowsSuffix = false;

if (abbreviation.startsWith("-")) {
allowsPrefix = true;
}

if (abbreviation.endsWith("-")) {
allowsSuffix = true;
}

if ("n.a.".equals(abbreviation)) {
abbreviation = name;
}

LwtaAbbreviation abbreviationToAdd = new LwtaAbbreviation(removeHyphens(name), removeHyphens(abbreviation), position, allowsPrefix, allowsSuffix);
lwtaAbbreviations.add(abbreviationToAdd);
}
}
}

private char detectDelimiter(Path file) throws IOException {
try (BufferedReader reader = Files.newBufferedReader(file, StandardCharsets.UTF_8)) {
String line = reader.readLine();
Expand All @@ -63,7 +110,21 @@ private char detectDelimiter(Path file) throws IOException {
}
}

private static String removeHyphens(String string) {
if (string.startsWith("-")) {
string = string.substring(1);
}
if (string.endsWith("-")) {
string = string.substring(0, string.length() - 1);
}
Comment on lines +117 to +119
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just use

string = org.jabref.model.strings.StringUtil.removeStringAtTheEnd(string, "-");

(replace by normal import - i just want tos show you the package)

return string;
}

/**
 * @return the journal abbreviations held by this parser (the parser's own collection, not a copy)
 */
public Collection<Abbreviation> getAbbreviations() {
    return this.abbreviations;
}

/**
 * @return the LWTA abbreviations held by this parser (the parser's own collection, not a copy)
 */
public Collection<LwtaAbbreviation> getLwtaAbbreviations() {
    return this.lwtaAbbreviations;
}
}
41 changes: 41 additions & 0 deletions src/main/java/org/jabref/logic/journals/LwtaAbbreviation.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package org.jabref.logic.journals;

public class LwtaAbbreviation {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Refactor to record

private final String unAbbreviated;
private final String abbreviation;
private final Position position;
private final boolean allowsSuffix;
private final boolean allowsPrefix;

enum Position {
ENDS_WORD, STARTS_WORD, IN_WORD, FULL_WORD
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sort more logically: full, starts, in, end

}

LwtaAbbreviation(String unAbbreviated, String abbreviation, Position position, boolean allowsPrefix, boolean allowsSuffix) {
this.unAbbreviated = unAbbreviated;
this.abbreviation = abbreviation;
this.position = position;
this.allowsPrefix = allowsPrefix;
this.allowsSuffix = allowsSuffix;
}

public String getAbbreviation() {
return abbreviation;
}

public String getUnAbbreviated() {
return unAbbreviated;
}

public Position getPosition() {
return position;
}

public boolean getAllowsPrefix() {
return allowsPrefix;
}

public boolean getAllowsSuffix() {
return allowsSuffix;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
package org.jabref.logic.journals;

import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

public class LwtaAbbreviationRepository {

// Maps each un-abbreviated pattern (LwtaAbbreviation#getUnAbbreviated) to its LWTA entry; filled once in the constructor.
private final Map<String, LwtaAbbreviation> lwtaToAbbreviationObject;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove Object at the end. Nearly everything is an object in Java.


// incomplete list
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is that for an "incomplete" list. What should one do to add words? Either state that or remove the comment.

private final String[] WORDS_TO_REMOVE = new String[]{"the", "and", "&", "of", "but", "sans", "section", "series", "part"};

/**
 * Builds the repository from an LWTA abbreviation file.
 *
 * @param file delimited file that is handed to {@code AbbreviationParser#readLwtaAbbreviations}
 * @throws IOException if the parser fails to read the file
 */
public LwtaAbbreviationRepository(Path file) throws IOException {
    AbbreviationParser lwtaParser = new AbbreviationParser();
    lwtaParser.readLwtaAbbreviations(file);

    // Index the parsed entries by their un-abbreviated pattern; a later entry replaces an earlier one with the same pattern.
    Map<String, LwtaAbbreviation> byUnAbbreviated = new HashMap<>();
    for (LwtaAbbreviation entry : lwtaParser.getLwtaAbbreviations()) {
        byUnAbbreviated.put(entry.getUnAbbreviated(), entry);
    }
    lwtaToAbbreviationObject = byUnAbbreviated;
}

/**
 * Tells whether the given abbreviation's pattern occurs in the word at the place its position requires.
 * Any position other than IN_WORD, STARTS_WORD or ENDS_WORD never matches here.
 *
 * @param word             the word to test (callers pass it lower-cased)
 * @param lwtaAbbreviation the abbreviation whose pattern and position are checked
 * @return true if the pattern is contained in, starts, or ends the word, according to its position
 */
private boolean canAbbreviate(String word, LwtaAbbreviation lwtaAbbreviation) {
    LwtaAbbreviation.Position position = lwtaAbbreviation.getPosition();
    String pattern = lwtaAbbreviation.getUnAbbreviated();
    switch (position) {
        case IN_WORD:
            return word.contains(pattern);
        case STARTS_WORD:
            return word.startsWith(pattern);
        case ENDS_WORD:
            return word.endsWith(pattern);
        default:
            return false;
    }
}

/**
 * Abbreviates a single word using the given candidate patterns.
 * <p>
 * Among the candidates that match the word, the longest pattern is applied first. The text before and/or
 * after the matched pattern is then abbreviated recursively, but only if the applied abbreviation allows a
 * prefix/suffix; otherwise that text is dropped. The recursion only considers candidates that matched the
 * current word. If the word started with an upper-case letter, so does the result.
 *
 * @param word          the word to abbreviate; may be empty
 * @param abbreviations un-abbreviated patterns (keys of the repository map) to consider
 * @return the abbreviated word, or the word itself if no candidate applies
 */
private String abbreviateWord(String word, Set<String> abbreviations) {
    // Recursive calls pass an empty head/tail when the pattern covers the start/end of the word.
    if (word.isEmpty()) {
        return word;
    }

    // Matching is done on the lower-cased word; the original capitalisation is restored at the end.
    boolean capitalised = Character.isUpperCase(word.charAt(0));
    String wordLowerCase = word.toLowerCase(Locale.ROOT);

    List<String> possibleAbbreviations = new ArrayList<>();
    for (String abbreviation : abbreviations) {
        LwtaAbbreviation lwtaAbbreviation = lwtaToAbbreviationObject.get(abbreviation);
        // An empty pattern matches everywhere without consuming text, which would make the recursion endless.
        if (lwtaAbbreviation == null || lwtaAbbreviation.getUnAbbreviated().isEmpty()) {
            continue;
        }
        if (canAbbreviate(wordLowerCase, lwtaAbbreviation)) {
            possibleAbbreviations.add(abbreviation);
        }
    }

    if (possibleAbbreviations.isEmpty()) {
        return word;
    }

    // Resolve conflicts (e.g. both "balti-" and "baltimore" match) by applying the longest pattern first.
    possibleAbbreviations.sort((first, second) -> Integer.compare(second.length(), first.length()));
    LwtaAbbreviation abbreviationUsed = lwtaToAbbreviationObject.get(possibleAbbreviations.getFirst());
    Set<String> possibleAbbSet = new HashSet<>(possibleAbbreviations);

    String pattern = abbreviationUsed.getUnAbbreviated();
    int matchStart = switch (abbreviationUsed.getPosition()) {
        case ENDS_WORD -> word.length() - pattern.length();
        case STARTS_WORD -> 0;
        // Locate the un-abbreviated pattern literally; the abbreviation itself is not part of the word.
        case IN_WORD -> wordLowerCase.indexOf(pattern);
        default -> -1;
    };

    // Verify against the original word so the indices are valid even if lower-casing changed its length.
    if (matchStart < 0 || !word.regionMatches(true, matchStart, pattern, 0, pattern.length())) {
        return word;
    }

    String head = word.substring(0, matchStart);
    String tail = word.substring(matchStart + pattern.length());
    String prefix = abbreviationUsed.getAllowsPrefix() ? abbreviateWord(head, possibleAbbSet) : "";
    String suffix = abbreviationUsed.getAllowsSuffix() ? abbreviateWord(tail, possibleAbbSet) : "";
    String wordAbb = prefix + abbreviationUsed.getAbbreviation() + suffix;

    if (capitalised && !wordAbb.isEmpty()) {
        wordAbb = wordAbb.substring(0, 1).toUpperCase(Locale.ROOT) + wordAbb.substring(1);
    }

    return wordAbb;
}

/**
 * Turns a journal name into its LWTA abbreviation.
 * <p>
 * Commas are removed and full stops replaced by commas, the name is split on spaces, and the words listed
 * in {@code WORDS_TO_REMOVE} are dropped. A name with a single remaining word is returned as that word,
 * unabbreviated. Otherwise each word is abbreviated and the results are joined with single spaces.
 *
 * @param name the journal name to abbreviate
 * @return the abbreviated name; empty if no word remains after filtering
 */
String abbreviateJournalName(String name) {
    // Remove commas and replace full stops with commas
    String normalised = name.replace(",", "").replace(".", ",");

    List<String> wordsToBeAbbreviated = new ArrayList<>();
    for (String word : normalised.split(" ")) {
        // Consecutive or leading spaces produce empty tokens; they carry no content and have no first character.
        if (word.isEmpty()) {
            continue;
        }

        // Remove articles/prepositions
        boolean removeWord = false;
        for (String wordToRemove : WORDS_TO_REMOVE) {
            if (word.equalsIgnoreCase(wordToRemove)) {
                removeWord = true;
                break;
            }
        }

        if (!removeWord) {
            wordsToBeAbbreviated.add(word);
        }
    }

    // Single word titles should not be abbreviated:
    if (wordsToBeAbbreviated.size() == 1) {
        return wordsToBeAbbreviated.getFirst();
    }

    List<String> abbreviatedWords = new ArrayList<>(wordsToBeAbbreviated.size());
    for (String word : wordsToBeAbbreviated) {
        // Both forms depend only on the word, so compute them once rather than per map key.
        String lowerWord = word.toLowerCase(Locale.ROOT);
        String withoutPunctuation = lowerWord.replaceAll("[^\\sa-zA-Z0-9]", "");

        String abbreviated = null;
        for (Map.Entry<String, LwtaAbbreviation> entry : lwtaToAbbreviationObject.entrySet()) {
            String unAbbreviated = entry.getKey();
            // If the word is just punctuation and an abbreviation pattern, use that abbreviation directly:
            if (lowerWord.contains(unAbbreviated.toLowerCase(Locale.ROOT)) && withoutPunctuation.equalsIgnoreCase(unAbbreviated)) {
                abbreviated = entry.getValue().getAbbreviation();
                // Fix capitalisation:
                if (Character.isUpperCase(word.charAt(0)) && !abbreviated.isEmpty()) {
                    abbreviated = abbreviated.substring(0, 1).toUpperCase(Locale.ROOT) + abbreviated.substring(1);
                }
                break;
            }
        }

        if (abbreviated == null) {
            abbreviated = abbreviateWord(word, lwtaToAbbreviationObject.keySet());
        }

        abbreviatedWords.add(abbreviated);
    }

    // Joining an empty list yields "", so a name made only of removed words no longer throws.
    return String.join(" ", abbreviatedWords);
}
}

Loading
Loading