Skip to content

Commit 10f5efc

Browse files
committed
feat: remove keywords with text in square brackets in streamgraphs
1 parent 9a8f4bd commit 10f5efc

1 file changed

Lines changed: 19 additions & 1 deletion

File tree

  • server/preprocessing/other-scripts

server/preprocessing/other-scripts/base.R

Lines changed: 19 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -248,7 +248,13 @@ etl <- function(res, repo, non_public) {
248248
subject_cleaned = gsub("(wikidata)?\\.org/entity/[qQ]([\\d]+)?", "", subject_cleaned) # remove wikidata classification
249249
subject_cleaned = gsub("</keyword><keyword>", "", subject_cleaned) # remove </keyword><keyword>
250250
subject_cleaned = gsub("\\[No keyword\\]", "", subject_cleaned)
251-
subject_cleaned = gsub("\\[[^]]*\\]", "", subject_cleaned) # remove any text inside square brackets
251+
252+
if (!is.null(params$vis_type) && params$vis_type == "timeline") {
253+
subject_cleaned = remove_keywords_with_text_in_square_brackets(subject_cleaned)
254+
} else {
255+
subject_cleaned = remove_text_in_square_brackets_from_keywords(subject_cleaned)
256+
}
257+
252258
subject_cleaned = gsub("\\[[^\\[]+\\][^\\;]+(;|$)?", "", subject_cleaned) # remove classification
253259
subject_cleaned = gsub("[0-9]{2,} [A-Z]+[^;]*(;|$)?", "", subject_cleaned) #remove classification
254260
subject_cleaned = gsub(" -- ", "; ", subject_cleaned) #replace inconsistent keyword separation
@@ -357,6 +363,18 @@ decode_dctypenorm <- function(dctypestring) {
357363
return(typecodes)
358364
}
359365

366+
# Drop every keyword that carries a square-bracket annotation.
#
# `x` is a character vector of ";"-separated keyword strings. Any keyword
# containing a non-empty "[...]" group is deleted in full, together with the
# ";" that directly follows it (if present). Keywords without such a group,
# and keywords whose brackets are empty ("[]"), are left untouched.
# Example: "Climate [MeSH];Weather" -> "Weather".
remove_keywords_with_text_in_square_brackets <- function(x) {
  # Pattern parts: keyword text before the bracket, the bracketed group,
  # the remainder of the keyword, and an optional trailing separator.
  bracketed_keyword <- "[^;]*\\[[^]]+\\][^;]*;?"
  gsub(pattern = bracketed_keyword, replacement = "", x = x)
}
371+
372+
# Strip square-bracket annotations out of keyword strings.
#
# Every "[...]" group in `x` (an empty "[]" included) is deleted. The keyword
# text around the group is kept exactly as it was, so whitespace next to the
# bracket remains.
# Example: "Climate [MeSH]" -> "Climate " (note the trailing space).
remove_text_in_square_brackets_from_keywords <- function(x) {
  bracket_group <- "\\[[^]]*\\]"
  gsub(pattern = bracket_group, replacement = "", x = x)
}
377+
360378
dctypenorm_decoder <- list(
361379
"4"="Audio",
362380
"11"="Book",

0 commit comments

Comments
 (0)