From d033c9bc685034b7c8605dce1c075f60399bb594 Mon Sep 17 00:00:00 2001 From: ieshghi Date: Tue, 5 Aug 2025 13:56:54 -0400 Subject: [PATCH 1/4] added $hash function to gWalks, and added an == operator for gWalks --- NAMESPACE | 1 + R/gGnome.R | 36 ++++++++++++++++++++++++++++++++++++ man/equals.gWalk.Rd | 20 ++++++++++++++++++++ 3 files changed, 57 insertions(+) create mode 100644 man/equals.gWalk.Rd diff --git a/NAMESPACE b/NAMESPACE index 71de521..b800458 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ S3method("+",gGraph) S3method("==",gEdge) S3method("==",gGraph) S3method("==",gNode) +S3method("==",gWalk) S3method("[",Junction) S3method("[",gEdge) S3method("[",gGraph) diff --git a/R/gGnome.R b/R/gGnome.R index 4cc282f..6a48095 100644 --- a/R/gGnome.R +++ b/R/gGnome.R @@ -9100,11 +9100,47 @@ gWalk = R6::R6Class("gWalk", ## GWALKS } ix = 1:self$length return(self$dts(ix)) + }, + + + #' @name hash + #' @description + #' hashes the gWalk to a string, by concatenating each walk with its reverse complement, tacking on a flag for circular or linear, then converting to strings and concatenating + hash = function() { + cn = self$dt$cn + if (is.null(cn)){ + cn = rep(1,length(self)) + } + circ = ifelse(rep(rep(self$circular,cn),2),'C','L') + snode.id = rep(self$snode.id,cn) + nodepcomp = c(snode.id,lapply(snode.id,function(s){-rev(s)})) + nodestring = lapply(1:length(nodepcomp),function(i){ + paste0(toString(nodepcomp[[i]]),circ[i]) + }) + return(toString(sort(do.call('c',nodestring)))) } ) ) + + +#' @name ==.gWalk +#' @rdname equals.gWalk +#' @title equals.gWalk +#' @description +#' +#' Returns TRUE if two walksets are equivalent, up to their reverse complement. The order of the walks within each set is ignored (the hash sorts the per-walk strings). 
+#' Walksets are said to be equivalent iff their graphs are equal under == and their $hash strings (built from snode.id) are equal +#' +#' @param x gWalk +#' @param y gWalk +#' @return TRUE if objects are equivalent +#' @export +'==.gWalk' = function(x,y){ + return((x$graph==y$graph) & (x$hash==y$hash)) +} + #' @name c #' @title c #' @description diff --git a/man/equals.gWalk.Rd b/man/equals.gWalk.Rd new file mode 100644 index 0000000..083a6d8 --- /dev/null +++ b/man/equals.gWalk.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gGnome.R +\name{==.gWalk} +\alias{==.gWalk} +\title{equals.gWalk} +\usage{ +\method{==}{gWalk}(x, y) +} +\arguments{ +\item{x}{gWalk} + +\item{y}{gWalk} +} +\value{ +TRUE if objects are equivalent +} +\description{ +Returns TRUE if two walksets are equivalent, up to their reverse complement. The order of the walks within each set is ignored (the hash sorts the per-walk strings). +Walksets are said to be equivalent iff their graphs are equal under == and their $hash strings (built from snode.id) are equal +} From 7b381014516e590f761792af01437dfa0a2788e4 Mon Sep 17 00:00:00 2001 From: ieshghi Date: Fri, 24 Apr 2026 09:20:32 -0400 Subject: [PATCH 2/4] added functions related to hashing (booth_rotate and sort_snodes) in misc_utils.R, and updated hashing function inside gGnome.R. 
Should handle circular walks correctly now --- R/gGnome.R | 45 +++++++++++++++--------------- R/misc_utils.R | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 22 deletions(-) diff --git a/R/gGnome.R b/R/gGnome.R index 6a48095..85500ef 100644 --- a/R/gGnome.R +++ b/R/gGnome.R @@ -8946,8 +8946,7 @@ gWalk = R6::R6Class("gWalk", ## GWALKS ## drop = drop, ## private$pmeta ## ) - } - ), + } ), active = list( ## Returns a GRangesList of walks in the graph grl = function() @@ -9100,27 +9099,29 @@ gWalk = R6::R6Class("gWalk", ## GWALKS } ix = 1:self$length return(self$dts(ix)) - }, - - - #' @name hash - #' @description - #' hashes the gWalk to a string, by concatenating each walk with its reverse complement, tacking on a flag for circular or linear, then converting to strings and concatenating - hash = function() { - cn = self$dt$cn - if (is.null(cn)){ - cn = rep(1,length(self)) - } - circ = ifelse(rep(rep(self$circular,cn),2),'C','L') - snode.id = rep(self$snode.id,cn) - nodepcomp = c(snode.id,lapply(snode.id,function(s){-rev(s)})) - nodestring = lapply(1:length(nodepcomp),function(i){ - paste0(toString(nodepcomp[[i]]),circ[i]) - }) - return(toString(sort(do.call('c',nodestring)))) + } , + #' @name hash + #' @description + #' hashes the gWalk to a string: walks are repeated by copy number (cn, 1 when absent), passed through sort_snodes(), written out together with their reverse complements as comma-separated strings suffixed 'C' (circular) or 'L' (linear), then sorted and joined + hash = function() { + cn = self$dt$cn + if (is.null(cn)){ + cn = rep(1,length(self)) } - ) - ) + snode.id = rep(self$snode.id,cn) + circular = rep(self$circular,cn) + sorted = sort_snodes(snode.id,circular) + snode.id = sorted$nodelist + circular = sorted$arr + circ = ifelse(rep(circular,2),'C','L') + nodepcomp = c(snode.id,lapply(snode.id,function(s){-rev(s)})) + nodestring = lapply(1:length(nodepcomp),function(i){ + return(paste0(toString(nodepcomp[[i]]),circ[i])) + }) + return(toString(sort(do.call('c',nodestring)))) + } + ) + ) diff 
--git a/R/misc_utils.R b/R/misc_utils.R index 0d5ad41..72a0044 100644 --- a/R/misc_utils.R +++ b/R/misc_utils.R @@ -728,3 +728,78 @@ dt_na2false = function(dt, these_cols = NULL) { } return(dt) } + + +#' @name booth_rotate +#' @title Rotate a circular vector to a canonical (lexicographically minimal) starting point +#' +#' @param x vector read as circular; its elements must be comparable with == and > (e.g. signed node ids) +#' @return x rotated to begin at the start index chosen by the scan (x itself when that index is 1 or x is empty) +booth_rotate = function(x) { # picks a canonical rotation so circular walk hashes do not depend on the start node. NOTE(review): the i/j/k scan below looks like the two-candidate least-rotation method rather than Booth's failure-function algorithm - confirm before relying on the name. See https://en.wikipedia.org/wiki/Lexicographically_minimal_string_rotation + booth = function(s) { + n = length(s) + if (n == 0) return(1L) + s2 = c(s, s) + i = 1L + j = 2L + k = 0L + while (i <= n && j <= n && k < n) { + a = s2[i + k] + b = s2[j + k] + if (a == b) { + k = k + 1L + } else if (a > b) { + # rotation at i is worse than rotation at j -> skip i's prefix + i = i + k + 1L + if (i <= j) i = j + 1L + k = 0L + } else { + # rotation at j is worse -> skip j's prefix + j = j + k + 1L + if (j <= i) j = i + 1L + k = 0L + } + } + pos = min(i, j) + # defensive wrap into 1..n; each pass advances only one of i, j so min(i, j) should already be <= n - TODO confirm + if (pos > n) pos = pos - n + return(pos) + } + start = booth(x) + n = length(x) + if (start==1){ + return(x) + }else{ + return(x[c(start:n, 1:(start-1))]) + } +} + +#' @name sort_snodes +#' @title sort a signed nodelist unambiguously, for use in hashing +#' +#' @param nodelist list of signed integer vectors (one per walk) pointing to nodes in some gGraph +#' @param circ boolean vector (same order as nodelist) indicating which of the node vectors is circular; when non-NULL the result is list(nodelist=, circ=) with circ reordered alongside the walks +sort_snodes = function(nodelist,circ=NULL) { + if (sum(circ)>0){ + circ_walks = nodelist[circ] + circ_walks_rot = lapply(circ_walks,function(w){ + return(booth_rotate(w)) + }) + nodelist[circ] = circ_walks_rot + } + choose_compl = lapply(nodelist, function(x) { + rc = -rev(x) + if (paste(x, collapse = ",") <= paste(rc, collapse = ",")) { + x + } else { + rc + }}) + ord <- order(sapply(choose_compl, paste, collapse = ",")) + sorted_nodes = choose_compl[ord] + if (!is.null(circ)){ + sorted_circ = circ[ord] 
+ return(list(nodelist=sorted_nodes,circ=sorted_circ)) + }else{ + return(sorted_nodes) + } +} From 20b080199f3eb97bf461dcec17c73c2e3b9859c1 Mon Sep 17 00:00:00 2001 From: ieshghi Date: Fri, 24 Apr 2026 09:43:42 -0400 Subject: [PATCH 3/4] fixed small error inside hashing function --- R/gGnome.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/gGnome.R b/R/gGnome.R index 85500ef..2375a1e 100644 --- a/R/gGnome.R +++ b/R/gGnome.R @@ -9112,7 +9112,7 @@ gWalk = R6::R6Class("gWalk", ## GWALKS circular = rep(self$circular,cn) sorted = sort_snodes(snode.id,circular) snode.id = sorted$nodelist - circular = sorted$arr + circular = sorted$circ circ = ifelse(rep(circular,2),'C','L') nodepcomp = c(snode.id,lapply(snode.id,function(s){-rev(s)})) nodestring = lapply(1:length(nodepcomp),function(i){ From 70e6454b366808e261198fa8b95d7061dba4aa29 Mon Sep 17 00:00:00 2001 From: ieshghi Date: Fri, 24 Apr 2026 10:20:21 -0400 Subject: [PATCH 4/4] fixed a bug where the booth rotation wasnt taking into account the reverse complement symmetry of the walks --- R/misc_utils.R | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/R/misc_utils.R b/R/misc_utils.R index 72a0044..e89d355 100644 --- a/R/misc_utils.R +++ b/R/misc_utils.R @@ -783,17 +783,28 @@ sort_snodes = function(nodelist,circ=NULL) { if (sum(circ)>0){ circ_walks = nodelist[circ] circ_walks_rot = lapply(circ_walks,function(w){ - return(booth_rotate(w)) + r1 = booth_rotate(w) + r2 = booth_rotate(-rev(w)) + if (toString(r1)<=toString(r2)){ + return(r1) + }else{ + return(r2) + } }) nodelist[circ] = circ_walks_rot } - choose_compl = lapply(nodelist, function(x) { - rc = -rev(x) - if (paste(x, collapse = ",") <= paste(rc, collapse = ",")) { - x + choose_compl = lapply(seq_along(nodelist), function(i) { + if (!is.null(circ) && circ[i]){ # circ defaults to NULL and NULL[i] would make a bare if() fail; with no circ every walk is handled as linear + return(nodelist[[i]]) } else { - rc - }}) + x = nodelist[[i]] + rc = -rev(x) + if (paste(x, collapse = ",") <= paste(rc, collapse = ",")) { + return(x) + } else { 
+ return(rc) + }} + }) ord <- order(sapply(choose_compl, paste, collapse = ",")) sorted_nodes = choose_compl[ord] if (!is.null(circ)){