Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 27 additions & 43 deletions R/audits.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,12 @@
#' @export
dedup <- function(vault = default_vault(), threshold = 0.7) {
if (!requireNamespace("stringdist", quietly = TRUE)) {
stop("dedup() requires the 'stringdist' package.",
call. = FALSE)
stop("dedup() requires the 'stringdist' package.", call. = FALSE)
}
vault <- normalizePath(vault, mustWork = TRUE)

reg <- vault_registry(vault)
pages <- reg[!reg$system_file, , drop = FALSE]
pages <- reg[!reg$system_file,, drop = FALSE]
n <- nrow(pages)
if (n < 2L) {
proposals <- empty_dedup_result()
Expand All @@ -49,8 +48,7 @@ dedup <- function(vault = default_vault(), threshold = 0.7) {
}

titles <- ifelse(!is.na(pages$title) & nzchar(pages$title),
tolower(trimws(pages$title)),
tolower(pages$node_id))
tolower(trimws(pages$title)), tolower(pages$node_id))

page_a <- character(0L)
page_b <- character(0L)
Expand All @@ -61,7 +59,7 @@ dedup <- function(vault = default_vault(), threshold = 0.7) {
for (i in seq_len(n - 1L)) {
for (j in seq.int(i + 1L, n)) {
jw_dist <- stringdist::stringdist(titles[i], titles[j],
method = "jw")
method = "jw")
sim <- 1 - jw_dist
jacc <- tag_jaccard(pages$tags[[i]], pages$tags[[j]])
score <- 0.6 * sim + 0.4 * jacc
Expand All @@ -70,9 +68,8 @@ dedup <- function(vault = default_vault(), threshold = 0.7) {
page_b <- c(page_b, pages$path[j])
title_sim <- c(title_sim, sim)
tag_overlap <- c(
tag_overlap,
length(intersect(pages$tags[[i]],
pages$tags[[j]])))
tag_overlap,
length(intersect(pages$tags[[i]], pages$tags[[j]])))
combined <- c(combined, score)
}
}
Expand All @@ -84,8 +81,8 @@ dedup <- function(vault = default_vault(), threshold = 0.7) {
combined_score = combined,
stringsAsFactors = FALSE)
if (nrow(proposals) > 0L) {
proposals <- proposals[order(-proposals$combined_score), ,
drop = FALSE]
proposals <- proposals[order(-proposals$combined_score),,
drop = FALSE]
rownames(proposals) <- NULL
}
write_dedup_proposals(vault, proposals)
Expand All @@ -95,10 +92,8 @@ dedup <- function(vault = default_vault(), threshold = 0.7) {
#' @noRd
empty_dedup_result <- function() {
data.frame(page_a = character(0L), page_b = character(0L),
title_similarity = numeric(0L),
tag_overlap = integer(0L),
combined_score = numeric(0L),
stringsAsFactors = FALSE)
title_similarity = numeric(0L), tag_overlap = integer(0L),
combined_score = numeric(0L), stringsAsFactors = FALSE)
}

#' Jaccard overlap of two tag sets
Expand All @@ -123,13 +118,9 @@ write_dedup_proposals <- function(vault, proposals) {
out_path <- file.path(out_dir, "dedup.md")

if (nrow(proposals) == 0L) {
writeLines(c("---",
"title: Dedup proposals",
sprintf("updated: %s", now_ts()),
"---",
"",
"# Dedup proposals",
"",
writeLines(c("---", "title: Dedup proposals",
sprintf("updated: %s", now_ts()), "---", "",
"# Dedup proposals", "",
"No candidate duplicates above threshold."),
out_path)
return(invisible(out_path))
Expand All @@ -142,8 +133,7 @@ write_dedup_proposals <- function(vault, proposals) {
"",
"# Dedup proposals",
"",
sprintf("%d candidate pair(s) above threshold.",
nrow(proposals)),
sprintf("%d candidate pair(s) above threshold.", nrow(proposals)),
"")
for (i in seq_len(nrow(proposals))) {
lines <- c(lines,
Expand Down Expand Up @@ -214,7 +204,7 @@ tags <- function(vault = default_vault(), taxonomy = NULL,
}

reg <- vault_registry(vault)
pages <- reg[!reg$system_file, , drop = FALSE]
pages <- reg[!reg$system_file,, drop = FALSE]

all_tags <- unlist(pages$tags, use.names = FALSE)
if (is.null(all_tags)) {
Expand Down Expand Up @@ -245,9 +235,8 @@ tags <- function(vault = default_vault(), taxonomy = NULL,
unk_tags <- used$tag[unk_mask]
suggestion <- vapply(unk_tags,
function(t) {
near_miss(t, allowed,
near_miss_threshold)
},
near_miss(t, allowed, near_miss_threshold)
},
character(1L))
data.frame(tag = unk_tags,
count = used$count[unk_mask],
Expand All @@ -262,8 +251,7 @@ tags <- function(vault = default_vault(), taxonomy = NULL,
sort(setdiff(allowed, used$tag))
}

write_tags_proposals(vault, used, unknown, unused_taxonomy,
taxonomy)
write_tags_proposals(vault, used, unknown, unused_taxonomy, taxonomy)
invisible(list(used = used, unknown = unknown,
unused_taxonomy = unused_taxonomy))
}
Expand All @@ -280,8 +268,9 @@ read_taxonomy <- function(taxonomy_path) {
# across R's default regex engine. `\\s` is not.
pat <- "^[[:space:]]*[-*][[:space:]]+`?([^`[:space:]]+)`?[[:space:]]*$"
m <- regmatches(lines, regexec(pat, lines))
tags <- vapply(m, function(x) if (length(x) >= 2L) x[[2L]]
else NA_character_, character(1L))
tags <- vapply(m,
function(x) if (length(x) >= 2L) x[[2L]] else NA_character_,
character(1L))
tags <- tags[!is.na(tags) & nzchar(tags)]
unique(tags)
}
Expand All @@ -308,19 +297,14 @@ near_miss <- function(tag, allowed, threshold) {

#' Write the tags proposals report
#' @noRd
write_tags_proposals <- function(vault, used, unknown,
unused_taxonomy, taxonomy_path) {
write_tags_proposals <- function(vault, used, unknown, unused_taxonomy,
taxonomy_path) {
out_dir <- file.path(vault, "_proposals")
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
out_path <- file.path(out_dir, "tags.md")

lines <- c("---",
"title: Tag audit",
sprintf("updated: %s", now_ts()),
"---",
"",
"# Tag audit",
"")
lines <- c("---", "title: Tag audit", sprintf("updated: %s", now_ts()),
"---", "", "# Tag audit", "")

if (is.null(taxonomy_path) || !file.exists(taxonomy_path)) {
lines <- c(lines,
Expand All @@ -347,8 +331,7 @@ write_tags_proposals <- function(vault, used, unknown,
suggestion <- if (is.na(unknown$suggestion[i])) {
""
} else {
sprintf(" -- did you mean `%s`?",
unknown$suggestion[i])
sprintf(" -- did you mean `%s`?", unknown$suggestion[i])
}
lines <- c(lines, sprintf("- `%s` (%d)%s",
unknown$tag[i], unknown$count[i],
Expand All @@ -368,3 +351,4 @@ write_tags_proposals <- function(vault, used, unknown,
writeLines(lines, out_path)
invisible(out_path)
}

51 changes: 24 additions & 27 deletions R/autoresearch.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,9 @@ autoresearch <- function(topic, vault = default_vault(),
if (!is.character(topic) || length(topic) != 1L || !nzchar(topic)) {
stop("`topic` must be a single non-empty string.", call. = FALSE)
}
if (!is.null(slug) &&
(!is.character(slug) || length(slug) != 1L)) {
if (!is.null(slug) && (!is.character(slug) || length(slug) != 1L)) {
stop("`slug` must be NULL or a single character value (got ",
typeof(slug), " of length ", length(slug), ").",
call. = FALSE)
typeof(slug), " of length ", length(slug), ").", call. = FALSE)
}
vault <- normalizePath(vault, mustWork = TRUE)
if (!file.exists(file.path(vault, "schema.md"))) {
Expand All @@ -99,7 +97,7 @@ autoresearch <- function(topic, vault = default_vault(),
fetch_backend <- fetch_backend %||% .autoresearch_default_fetch_backend()
if (is.null(model_backend)) {
model_backend <- .autoresearch_default_model_backend(provider = provider,
model = model)
model = model)
}
model_backend <- .verbose_model_backend(model_backend, verbose)
t_start <- Sys.time()
Expand Down Expand Up @@ -134,8 +132,8 @@ autoresearch <- function(topic, vault = default_vault(),
.ar_msg(verbose, "round ", round, ": running ", nrow(round_queries),
" search ", if (nrow(round_queries) == 1L) "query" else "queries")
round_search <- autoresearch_run_searches(round_queries,
search_backend, program,
verbose = verbose)
search_backend, program,
verbose = verbose)
round_search$round <- rep.int(as.integer(round), nrow(round_search))
search_results <- .autoresearch_bind_rows(search_results, round_search)
.ar_msg(verbose, "round ", round, ": ", nrow(round_search),
Expand All @@ -144,32 +142,31 @@ autoresearch <- function(topic, vault = default_vault(),
.ar_msg(verbose, "round ", round, ": selecting sources from ",
nrow(round_search), " results")
round_selected <- autoresearch_select_sources(topic, round_search,
program, model_backend)
program, model_backend)
if (nrow(round_selected) > 0L && nrow(selected) > 0L) {
round_selected <- round_selected[
!round_selected$url %in% selected$url,, drop = FALSE]
}
round_selected$round <- rep.int(as.integer(round),
nrow(round_selected))
round_selected$round <- rep.int(as.integer(round), nrow(round_selected))
selected <- .autoresearch_bind_rows(selected, round_selected)
.ar_msg(verbose, "round ", round, ": selected ", nrow(round_selected),
" new ", if (nrow(round_selected) == 1L) "source" else "sources")

.ar_msg(verbose, "round ", round, ": fetching ",
nrow(round_selected), " ",
if (nrow(round_selected) == 1L) "source" else "sources")
if (nrow(round_selected) == 1L) "source" else "sources")
round_sources <- autoresearch_fetch_and_ingest(round_selected,
fetch_backend, vault,
topic, force = force,
verbose = verbose)
fetch_backend, vault,
topic, force = force,
verbose = verbose)
round_sources$round <- rep.int(as.integer(round), nrow(round_sources))
sources <- .autoresearch_bind_rows(sources, round_sources)

.ar_msg(verbose, "round ", round, ": extracting evidence from ",
nrow(round_sources), " ",
if (nrow(round_sources) == 1L) "source" else "sources")
if (nrow(round_sources) == 1L) "source" else "sources")
round_claims <- autoresearch_extract_claims(topic, round_sources,
program, model_backend)
program, model_backend)
round_claims$round <- rep.int(as.integer(round), nrow(round_claims))
claims <- .autoresearch_bind_rows(claims, round_claims)
.ar_msg(verbose, "round ", round, ": extracted ", nrow(round_claims),
Expand All @@ -189,20 +186,20 @@ autoresearch <- function(topic, vault = default_vault(),
}
.ar_msg(verbose, "round ", round, ": analyzing gaps")
gap_plan <- autoresearch_analyze_gaps(topic, claims, sources,
all_queries, program,
model_backend, round)
all_queries, program,
model_backend, round)
gaps <- .autoresearch_bind_rows(gaps, gap_plan$gaps)
next_queries <- gap_plan$queries
.ar_msg(verbose, "round ", round, ": ", nrow(gap_plan$gaps),
" gaps, ", nrow(gap_plan$queries),
" follow-up ",
if (nrow(gap_plan$queries) == 1L) "query" else "queries")
if (nrow(gap_plan$queries) == 1L) "query" else "queries")
}

existing_pages <- autoresearch_existing_pages(vault)
.ar_msg(verbose, "vault has ", nrow(existing_pages),
" existing wiki ",
if (nrow(existing_pages) == 1L) "page" else "pages")
if (nrow(existing_pages) == 1L) "page" else "pages")

.ar_msg(verbose, "planning pages from ", nrow(claims), " claims and ",
nrow(sources), " sources")
Expand All @@ -212,22 +209,22 @@ autoresearch <- function(topic, vault = default_vault(),
" ", if (nrow(pages$pages) == 1L) "page" else "pages")
pages$pages <- .ar_apply_user_slug(pages$pages, slug, verbose)
pages$pages <- .ar_dedupe_planned_slugs(pages$pages, topic, vault,
verbose = verbose)
verbose = verbose)

if (isTRUE(update) && nrow(pages$pages) > 0L) {
update_targets <- .ar_update_targets(pages$pages, vault)
if (length(update_targets) > 0L) {
.ar_msg(verbose, "revising ", length(update_targets),
" existing ",
if (length(update_targets) == 1L) "page" else "pages",
if (length(update_targets) == 1L) "page" else "pages",
": ", paste(update_targets, collapse = ", "))
} else {
.ar_msg(verbose, "no existing pages to revise; writing fresh")
}
pages$pages <- autoresearch_revise_pages(pages$pages, topic, claims,
sources, vault, program,
model_backend,
verbose = verbose)
sources, vault, program,
model_backend,
verbose = verbose)
}

.ar_msg(verbose, "writing ", nrow(pages$pages),
Expand Down Expand Up @@ -383,8 +380,7 @@ print.pensar_research <- function(x, ...) {
stop("TAVILY_API_KEY not set", call. = FALSE)
}
if (!requireNamespace("jsonlite", quietly = TRUE)) {
stop("Tavily search requires the 'jsonlite' package.",
call. = FALSE)
stop("Tavily search requires the 'jsonlite' package.", call. = FALSE)
}
body <- jsonlite::toJSON(
list(api_key = api_key, query = query, max_results = as.integer(n)),
Expand Down Expand Up @@ -423,3 +419,4 @@ print.pensar_research <- function(x, ...) {
stringsAsFactors = FALSE
)
}

26 changes: 11 additions & 15 deletions R/autoresearch_backends.R
Original file line number Diff line number Diff line change
Expand Up @@ -137,17 +137,17 @@
"Return JSON: {\"claims\":[{\"source_path\":\"...\",\"source_slug\":\"...\",\"claim\":\"...\",\"confidence\":\"low|medium|high\",\"quote\":\"...\"}]}",
payload, sep = "\n\n"),
analyze_gaps = paste(
"Review the current source-grounded claims and identify important research gaps.",
"Return empty arrays if the topic is sufficiently covered or if additional searches would duplicate prior queries.",
"Return JSON: {\"gaps\":[{\"gap\":\"...\",\"reason\":\"...\"}],\"queries\":[{\"query\":\"...\",\"angle\":\"...\",\"gap\":\"...\"}]}",
payload, sep = "\n\n"),
"Review the current source-grounded claims and identify important research gaps.",
"Return empty arrays if the topic is sufficiently covered or if additional searches would duplicate prior queries.",
"Return JSON: {\"gaps\":[{\"gap\":\"...\",\"reason\":\"...\"}],\"queries\":[{\"query\":\"...\",\"angle\":\"...\",\"gap\":\"...\"}]}",
payload, sep = "\n\n"),
plan_pages = paste(
"Draft the wiki pages for the research run.",
"Every non-obvious claim must cite raw source wikilinks.",
"Use only source-grounded claims, quotes, source metadata, and existing_pages. Raw source bodies are intentionally unavailable at this stage.",
"Prefer updating an existing page when existing_pages contains a matching node_id, title, alias, or page_uid.",
sprintf("Slugs MUST be derived from the actual research topic, not the literal word 'topic'. Kebab-case, ASCII letters/digits/hyphens only. A reasonable default for this run's synthesis page is '%s'.",
paste0("Research-", slugify(input$topic))),
paste0("Research-", slugify(input$topic))),
"Return JSON: {\"headline\":\"<one-sentence headline>\",\"pages\":[{\"slug\":\"<kebab-slug-derived-from-topic>\",\"title\":\"<human title for the page>\",\"type\":\"analysis\",\"source\":\"<autoresearch session reference>\",\"body\":\"<markdown body>\"}]}",
payload, sep = "\n\n"),
revise_page = paste(
Expand Down Expand Up @@ -185,11 +185,9 @@
#' @noRd
.heuristic_plan_queries <- function(input, program) {
topic <- input$topic
base <- c(topic,
paste(topic, "overview"),
base <- c(topic, paste(topic, "overview"),
paste(topic, "primary sources"),
paste(topic, "official documentation"),
paste(topic, "review"))
paste(topic, "official documentation"), paste(topic, "review"))
base <- unique(base)
max_n <- min(length(base), program$max_queries_per_round)
list(queries = lapply(seq_len(max_n), function(i) {
Expand Down Expand Up @@ -248,12 +246,9 @@
if (!nzchar(new_draft)) {
return(list(body = existing))
}
revised <- paste(existing,
"",
sprintf("## Update %s", as.character(Sys.Date())),
"",
new_draft,
sep = "\n")
revised <- paste(existing, "",
sprintf("## Update %s", as.character(Sys.Date())), "",
new_draft, sep = "\n")
list(body = revised)
}

Expand Down Expand Up @@ -331,3 +326,4 @@
}
substr(pieces[[1L]], 1L, 240L)
}

Loading