Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions code_review_graph/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -802,8 +802,9 @@ def _node_to_text(node: GraphNode) -> str:
Designed so natural-language queries land on the right node, not just on
the enclosing class. We include the dotted ``Parent.name`` form, the
identifier split into words, an explicit ``"in <Parent>"`` phrase, the
enclosing module directory, and the language. Tested by the
``multi_hop_retrieval`` benchmark — see ``docs/REPRODUCING.md``.
docstring summary when the parser extracted one, the enclosing module
directory, and the language. Tested by the ``multi_hop_retrieval``
benchmark — see ``docs/REPRODUCING.md``.
"""
parts: list[str] = []

Expand Down Expand Up @@ -836,14 +837,22 @@ def _node_to_text(node: GraphNode) -> str:
if node.return_type:
parts.append(f"returns {node.return_type}")

# 7. Module / directory context from the file path — gives queries a
# 7. Docstring / doc comment summary — the author's own description of
# what the node does, and the only part written in the same natural
# language as the queries. Extracted by the parser (first paragraph,
# capped) into extra["docstring"].
docstring = node.extra.get("docstring") if node.extra else None
if docstring:
parts.append(str(docstring))

# 8. Module / directory context from the file path — gives queries a
# term like "routing" or "client" to anchor against.
if node.file_path:
parent_dir = Path(node.file_path).parent.name
if parent_dir and parent_dir not in (".", "src", "lib"):
parts.append(parent_dir)

# 8. Language
# 9. Language
if node.language:
parts.append(node.language)

Expand Down
222 changes: 222 additions & 0 deletions code_review_graph/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -785,6 +785,72 @@ def _is_test_function(
return False


# -- Docstring / doc comment helpers ---------------------------------------

# Hard cap, in characters, on stored docstring length. ``_clean_docstring``
# applies it by slicing the whitespace-collapsed first paragraph. Docstrings
# feed semantic-search embedding text, where the first sentence or two carry
# nearly all the signal; storing whole multi-page docstrings would bloat the
# DB and drown the name/signature terms.
_MAX_DOCSTRING_CHARS = 400

_PY_STRING_PREFIX_RE = re.compile(r"^[rRbBuUfF]{0,2}")
_LINE_COMMENT_MARKER_RE = re.compile(r"^//[/!]*\s?")


def _strip_python_string_literal(text: str) -> Optional[str]:
"""Strip prefix letters and quotes from a Python string literal.

Returns None for bytes and f-string literals: CPython does not treat
them as docstrings (``__doc__`` stays None), so neither do we.
"""
prefix = _PY_STRING_PREFIX_RE.match(text).group(0)
if "b" in prefix.lower() or "f" in prefix.lower():
return None
text = _PY_STRING_PREFIX_RE.sub("", text)
for quote in ('"""', "'''", '"', "'"):
if text.startswith(quote) and text.endswith(quote) and len(text) >= 2 * len(quote):
return text[len(quote):-len(quote)]
return text


def _strip_block_comment(text: str) -> str:
"""Strip ``/** ... */`` (or ``/*! ... */``) wrappers and leading ``*``."""
if text.startswith(("/**", "/*!")):
text = text[3:]
elif text.startswith("/*"):
text = text[2:]
if text.endswith("*/"):
text = text[:-2]
lines = []
for line in text.splitlines():
stripped = line.lstrip()
if stripped.startswith("*"):
stripped = stripped[1:]
if stripped.startswith(" "):
stripped = stripped[1:]
lines.append(stripped)
return "\n".join(lines)


def _clean_docstring(raw: str) -> str:
    """Return the first paragraph of ``raw`` as one capped line.

    Leading blank lines are skipped; the paragraph ends at the first blank
    line after some text has been seen. All runs of whitespace inside it
    are collapsed to single spaces and the result is cut to
    ``_MAX_DOCSTRING_CHARS`` characters.

    Only the first paragraph is kept because it is the summary under the
    usual doc conventions (PEP 257, JSDoc, godoc, rustdoc); later
    paragraphs describe parameters and details that add noise rather than
    signal to an embedding.
    """
    words: list[str] = []
    seen_text = False
    for raw_line in raw.splitlines():
        content = raw_line.strip()
        if content:
            seen_text = True
            words.extend(content.split())
        elif seen_text:
            # First blank line after the summary closes the paragraph.
            break
    return " ".join(words)[:_MAX_DOCSTRING_CHARS]


def file_hash(path: Path) -> str:
    """Return the hex-encoded SHA-256 digest of the bytes stored at ``path``."""
    digest = hashlib.sha256()
    digest.update(path.read_bytes())
    return digest.hexdigest()
Expand Down Expand Up @@ -4240,6 +4306,154 @@ def _emit_kafka_edges_from_method(
extra={"kafka_type": ann_name},
))

# -- Docstring / doc comment extraction ------------------------------

# Node types that ``_preceding_doc_comment`` collects as comments while
# walking the siblings above a definition.
_DOC_COMMENT_NODE_TYPES = frozenset({
    "comment", "block_comment", "line_comment", "doc_comment",
})
# Sibling node types allowed between a definition and its doc comment
# (attributes and decorators sit there in Rust / JS / TS). They are stepped
# over without ending the walk.
_DOC_COMMENT_SKIP_TYPES = frozenset({
    "attribute_item", "decorator",
})

def _get_docstring(self, node, language: str) -> Optional[str]:
    """Return the cleaned documentation summary for a definition node.

    For ``language == "python"`` the raw text comes from
    ``_python_docstring`` (the first string expression of the body). For
    every other language it comes from ``_preceding_doc_comment``, which
    reads the comment block directly above the definition: ``/** ... */``
    blocks, ``///`` / ``//!`` doc lines, or, for Go only, a contiguous
    plain ``//`` block. Rust inner doc comments (``//!``, ``/*! ... */``)
    are never attached to the following item.

    Args:
        node: The definition's syntax-tree node.
        language: Language name selecting the extraction strategy.

    Returns:
        The first paragraph, whitespace-collapsed and capped at
        ``_MAX_DOCSTRING_CHARS``, or None when nothing usable was found.
    """
    raw = (
        self._python_docstring(node)
        if language == "python"
        else self._preceding_doc_comment(node, language)
    )
    if raw:
        # An all-whitespace docstring cleans down to "", reported as None.
        return _clean_docstring(raw) or None
    return None

def _python_docstring(self, node) -> Optional[str]:
    """Return the raw docstring of a Python function/class body, if any.

    Looks up the node's ``body`` field, skips leading ``comment`` children,
    and hands the first remaining statement to ``_python_string_expr``.
    Later statements are never considered: only the first real statement
    can be the docstring. Returns None when there is no body or the body
    holds nothing but comments.
    """
    body = node.child_by_field_name("body")
    if body is None:
        return None
    first_stmt = next(
        (stmt for stmt in body.children if stmt.type != "comment"),
        None,
    )
    if first_stmt is None:
        return None
    return self._python_string_expr(first_stmt)

def _python_string_expr(self, expr) -> Optional[str]:
    """Resolve a statement node to its docstring text, or None.

    Depending on the grammar version the docstring is a bare ``string``
    child of the block or wrapped in an ``expression_statement``. CPython
    also accepts parenthesized and implicitly concatenated literals as
    docstrings, so wrappers are peeled off and ``concatenated_string``
    pieces are joined.

    Returns None when a wrapper holds anything other than exactly one
    meaningful child, when the node is not a string form, or when any
    literal involved is a bytes or f-string literal.
    """
    wrapper_types = ("expression_statement", "parenthesized_expression")
    ignorable_types = ("(", ")", "comment")

    # Peel wrappers until something that is not a wrapper is reached.
    while expr is not None and expr.type in wrapper_types:
        meaningful = [
            child for child in expr.children
            if child.type not in ignorable_types
        ]
        if len(meaningful) != 1:
            return None
        (expr,) = meaningful
    if expr is None:
        return None

    def literal_body(string_node) -> Optional[str]:
        source = string_node.text.decode("utf-8", errors="replace")
        return _strip_python_string_literal(source)

    if expr.type == "string":
        return literal_body(expr)
    if expr.type != "concatenated_string":
        return None

    fragments = []
    for piece in expr.children:
        if piece.type == "string":
            fragment = literal_body(piece)
            if fragment is None:
                return None  # a bytes/f-string piece disqualifies
            fragments.append(fragment)
    return "".join(fragments) if fragments else None

def _preceding_doc_comment(self, node, language: str) -> Optional[str]:
    """Return the raw doc comment block directly above ``node``.

    Walks preceding siblings upward, stepping over decorators/attributes,
    and collects comments that are line-adjacent (no blank line between
    the comment block and the definition). The walk stops at the first
    sibling that is neither a comment nor a skippable node.

    Selection rules applied to what was collected:

    * If the comment nearest the definition is a block comment, it is used
      alone and only when it is doc-style: ``/**`` or ``/*!``, except that
      Rust accepts ``/**`` only.
    * Otherwise every collected comment must be a ``//`` line comment.
      Rust drops leading ``//!`` lines and requires ``///`` on the rest;
      Go accepts plain ``//``; all other languages require ``///`` or
      ``//!`` on every line.

    Returns the comment text with markers removed, or None.
    """
    # Doc comments for exported declarations sit above the export
    # statement, not the inner declaration node.
    anchor = node
    while anchor.parent is not None and anchor.parent.type == "export_statement":
        anchor = anchor.parent

    collected: list = []  # nearest comment first
    boundary_row = anchor.start_point[0]
    candidate = anchor.prev_sibling
    # A sibling ending more than one row above the boundary is separated
    # by a blank line, so it is not this definition's doc.
    while candidate is not None and candidate.end_point[0] >= boundary_row - 1:
        is_skippable = candidate.type in self._DOC_COMMENT_SKIP_TYPES
        is_comment = (
            not is_skippable
            and candidate.type in self._DOC_COMMENT_NODE_TYPES
        )
        if not (is_skippable or is_comment):
            break
        if is_comment:
            collected.append(candidate)
        boundary_row = candidate.start_point[0]
        candidate = candidate.prev_sibling
    if not collected:
        return None

    # Back to source order: topmost comment first, nearest last.
    texts = [
        comment.text.decode("utf-8", errors="replace").strip()
        for comment in reversed(collected)
    ]

    # Block comment: use the nearest one alone; require doc style
    # (/** or /*!) so ordinary block comments aren't hoovered up.
    nearest = texts[-1]
    if nearest.startswith("/*"):
        # Rust's ``/*! ... */`` is an *inner* doc comment: it documents
        # the enclosing module/crate, never the following item.
        if language == "rust":
            accepted_openers = ("/**",)
        else:
            accepted_openers = ("/**", "/*!")
        if not nearest.startswith(accepted_openers):
            return None
        return _strip_block_comment(nearest)

    # Line comments: godoc reads plain ``//`` blocks; everything else
    # must use explicit doc markers (/// or //!) on every line.
    if not all(t.startswith("//") for t in texts):
        return None
    if language == "rust":
        # ``//!`` is an inner doc comment in Rust: drop leading module
        # docs, keep only ``///`` item docs.
        first_item_doc = 0
        while (
            first_item_doc < len(texts)
            and texts[first_item_doc].startswith("//!")
        ):
            first_item_doc += 1
        texts = texts[first_item_doc:]
        if not texts or not all(t.startswith("///") for t in texts):
            return None
    elif language != "go":
        if not all(t.startswith(("///", "//!")) for t in texts):
            return None
    return "\n".join(_LINE_COMMENT_MARKER_RE.sub("", t) for t in texts)

def _extract_classes(
self,
child,
Expand Down Expand Up @@ -4296,6 +4510,10 @@ def _extract_classes(
role = "workflow_interface" if is_wf else "activity_interface"
extra["temporal_role"] = role

docstring = self._get_docstring(child, language)
if docstring:
extra["docstring"] = docstring

node = NodeInfo(
kind="Class",
name=name,
Expand Down Expand Up @@ -4445,6 +4663,10 @@ def _extract_functions(
child, name, enclosing_class, file_path, edges,
)

docstring = self._get_docstring(child, language)
if docstring:
method_extra["docstring"] = docstring

node = NodeInfo(
kind=kind,
name=name,
Expand Down
12 changes: 12 additions & 0 deletions tests/test_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,18 @@ def test_file_node_no_kind(self):
# File kind should not add "file" as a kind label
assert "file.py" in text

def test_docstring_included(self):
    """A docstring stored under ``extra["docstring"]`` appears in the text."""
    rendered = _node_to_text(
        self._make_node(
            extra={"docstring": "Parses PDF rate sheets into rows."},
        )
    )
    assert "Parses PDF rate sheets into rows." in rendered

def test_docstring_absent_text_unchanged(self):
    """A node with an empty ``extra`` renders the same as a default node."""
    default_text = _node_to_text(self._make_node())
    empty_extra_text = _node_to_text(self._make_node(extra={}))
    assert empty_extra_text == default_text


class TestEmbeddingStore:
def test_store_initializes(self, tmp_path):
Expand Down
Loading