Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions openviking/parse/parsers/code/ast/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
".hpp": "cpp",
".rs": "rust",
".go": "go",
".cs": "csharp",
}

# Language key → (module path, class name, constructor kwargs)
Expand All @@ -38,6 +39,7 @@
"cpp": ("openviking.parse.parsers.code.ast.languages.cpp", "CppExtractor", {}),
"rust": ("openviking.parse.parsers.code.ast.languages.rust", "RustExtractor", {}),
"go": ("openviking.parse.parsers.code.ast.languages.go", "GoExtractor", {}),
"csharp": ("openviking.parse.parsers.code.ast.languages.csharp", "CSharpExtractor", {}),
}


Expand Down
189 changes: 189 additions & 0 deletions openviking/parse/parsers/code/ast/languages/csharp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
# SPDX-License-Identifier: Apache-2.0
"""C# AST extractor using tree-sitter-c-sharp."""

import re
from typing import List

from openviking.parse.parsers.code.ast.languages.base import LanguageExtractor
from openviking.parse.parsers.code.ast.skeleton import ClassSkeleton, CodeSkeleton, FunctionSig


def _node_text(node, content_bytes: bytes) -> str:
return content_bytes[node.start_byte : node.end_byte].decode("utf-8", errors="replace")


def _parse_doc_comment(raw: str) -> str:
"""Strip XML doc comment markers (/// or /** */) and extract text from XML tags."""
raw = raw.strip()
if raw.startswith("///"):
lines = raw.split("\n")
cleaned = []
for line in lines:
stripped = line.strip()
if stripped.startswith("///"):
stripped = stripped[3:].strip()
if stripped:
cleaned.append(stripped)
raw = " ".join(cleaned)
elif raw.startswith("/**"):
raw = raw[3:]
if raw.endswith("*/"):
raw = raw[:-2]
lines = [l.strip().lstrip("*").strip() for l in raw.split("\n")]
raw = "\n".join(l for l in lines if l).strip()
# Remove XML tags
raw = re.sub(r"</?[a-zA-Z][a-zA-Z0-9]*(?:\s+[^>]*)?/?>", "", raw)
# Normalize whitespace
raw = re.sub(r"\s+", " ", raw).strip()
return raw


def _preceding_doc(siblings: list, idx: int, content_bytes: bytes) -> str:
"""Return XML doc comment immediately before siblings[idx], or ''."""
if idx == 0:
return ""
comments = []
for i in range(idx - 1, -1, -1):
prev = siblings[i]
if prev.type == "comment":
text = _node_text(prev, content_bytes)
if text.strip().startswith("///") or text.strip().startswith("/**"):
comments.insert(0, _parse_doc_comment(text))
else:
break
elif prev.type in ("preprocessor_directive", "nullable_directive"):
continue
else:
break
return "\n".join(comments) if comments else ""


def _extract_method(node, content_bytes: bytes, docstring: str = "") -> FunctionSig:
name = ""
params = ""
return_type = ""

for child in node.children:
if child.type == "identifier" and not name:
name = _node_text(child, content_bytes)
elif child.type == "void_keyword":
return_type = "void"
elif child.type in ("predefined_type", "type_identifier", "generic_name"):
if not return_type:
return_type = _node_text(child, content_bytes)
elif child.type == "parameter_list":
raw = _node_text(child, content_bytes).strip()
if raw.startswith("(") and raw.endswith(")"):
raw = raw[1:-1]
params = raw.strip()

if node.type == "property_declaration":
for child in node.children:
if child.type == "accessor_list":
accessors = []
for acc in child.children:
if acc.type == "accessor_declaration":
accessor_name = ""
name_node = acc.child_by_field_name("name")
if name_node is not None:
accessor_name = _node_text(name_node, content_bytes).strip()
else:
for sub in acc.children:
if sub.type in ("get", "set", "init"):
accessor_name = sub.type
break
if accessor_name in ("get", "set", "init"):
accessors.append(accessor_name)
if accessors:
params = f"{{ {' '.join(accessors)} }}"

return FunctionSig(name=name, params=params, return_type=return_type, docstring=docstring)


def _extract_class(node, content_bytes: bytes, docstring: str = "") -> ClassSkeleton:
name = ""
bases: List[str] = []
body_node = None

for child in node.children:
if child.type == "identifier" and not name:
name = _node_text(child, content_bytes)
elif child.type == "base_list":
for sub in child.children:
if sub.type in ("type_identifier", "identifier"):
bases.append(_node_text(sub, content_bytes))
elif child.type == "declaration_list":
body_node = child

methods: List[FunctionSig] = []
if body_node:
siblings = list(body_node.children)
for idx, child in enumerate(siblings):
if child.type in ("method_declaration", "constructor_declaration"):
doc = _preceding_doc(siblings, idx, content_bytes)
methods.append(_extract_method(child, content_bytes, docstring=doc))
elif child.type == "property_declaration":
doc = _preceding_doc(siblings, idx, content_bytes)
methods.append(_extract_method(child, content_bytes, docstring=doc))

return ClassSkeleton(name=name, bases=bases, docstring=docstring, methods=methods)


class CSharpExtractor(LanguageExtractor):
def __init__(self):
import tree_sitter_c_sharp as tscsharp
from tree_sitter import Language, Parser

self._language = Language(tscsharp.language())
self._parser = Parser(self._language)

def extract(self, file_name: str, content: str) -> CodeSkeleton:
content_bytes = content.encode("utf-8")
tree = self._parser.parse(content_bytes)
root = tree.root_node

imports: List[str] = []
classes: List[ClassSkeleton] = []
functions: List[FunctionSig] = []

siblings = list(root.children)
for idx, child in enumerate(siblings):
if child.type == "using_directive":
for sub in child.children:
if sub.type == "identifier":
imports.append(_node_text(sub, content_bytes))
elif sub.type == "qualified_name":
imports.append(_node_text(sub, content_bytes))
elif child.type in ("namespace_declaration", "file_scoped_namespace_declaration"):
for sub in child.children:
if sub.type == "declaration_list":
ns_siblings = list(sub.children)
for ns_idx, ns_child in enumerate(ns_siblings):
if ns_child.type in (
"class_declaration",
"interface_declaration",
"struct_declaration",
"record_declaration",
):
doc = _preceding_doc(ns_siblings, ns_idx, content_bytes)
classes.append(
_extract_class(ns_child, content_bytes, docstring=doc)
)
elif child.type in (
"class_declaration",
"interface_declaration",
"struct_declaration",
"record_declaration",
):
doc = _preceding_doc(siblings, idx, content_bytes)
classes.append(_extract_class(child, content_bytes, docstring=doc))

return CodeSkeleton(
file_name=file_name,
language="C#",
module_doc="",
imports=imports,
classes=classes,
functions=functions,
)
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ dependencies = [
"tree-sitter-cpp>=0.23.0",
"tree-sitter-rust>=0.23.0",
"tree-sitter-go>=0.23.0",
"tree-sitter-c-sharp>=0.23.0",
]

[tool.uv.sources]
Expand Down
135 changes: 135 additions & 0 deletions tests/parse/test_ast_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ def _ts_extractor():
return JsTsExtractor(lang="typescript")


def _csharp_extractor():
from openviking.parse.parsers.code.ast.languages.csharp import CSharpExtractor
return CSharpExtractor()



# ---------------------------------------------------------------------------
# Python
Expand Down Expand Up @@ -452,6 +457,130 @@ def test_to_text_verbose(self):
assert "@return sum of a and b" in text


# ---------------------------------------------------------------------------
# C#
# ---------------------------------------------------------------------------

class TestCSharpExtractor:
SAMPLE = """
using System;
using System.Collections.Generic;

namespace MyApp.Services
{
/// <summary>
/// A simple calculator service.
///
/// Supports basic arithmetic operations.
/// </summary>
public class Calculator
{
/// <summary>
/// Add two integers.
///
/// <param name=\"a\">First operand</param>
/// <param name=\"b\">Second operand</param>
/// <returns>Sum of a and b</returns>
/// </summary>
public int Add(int a, int b)
{
return a + b;
}

/// <summary>
/// Subtract b from a.
/// </summary>
public int Subtract(int a, int b)
{
return a - b;
}
}
}
"""

def setup_method(self):
self.e = _csharp_extractor()

def test_imports(self):
sk = self.e.extract("Calculator.cs", self.SAMPLE)
assert "System" in sk.imports
assert "System.Collections.Generic" in sk.imports

def test_class_extracted(self):
sk = self.e.extract("Calculator.cs", self.SAMPLE)
names = {c.name for c in sk.classes}
assert "Calculator" in names

def test_class_docstring(self):
sk = self.e.extract("Calculator.cs", self.SAMPLE)
cls = next(c for c in sk.classes if c.name == "Calculator")
assert "simple calculator service" in cls.docstring
assert "Supports basic arithmetic" in cls.docstring

def test_methods_extracted(self):
sk = self.e.extract("Calculator.cs", self.SAMPLE)
cls = next(c for c in sk.classes if c.name == "Calculator")
methods = {m.name: m for m in cls.methods}
assert "Add" in methods
assert "Subtract" in methods

def test_method_docstring(self):
sk = self.e.extract("Calculator.cs", self.SAMPLE)
cls = next(c for c in sk.classes if c.name == "Calculator")
methods = {m.name: m for m in cls.methods}
assert "Add two integers." in methods["Add"].docstring
assert "First operand" in methods["Add"].docstring

def test_to_text_compact(self):
sk = self.e.extract("Calculator.cs", self.SAMPLE)
text = sk.to_text(verbose=False)
assert "# Calculator.cs [C#]" in text
assert "class Calculator" in text
assert "+ Add(" in text
assert "First operand" not in text

def test_to_text_verbose(self):
sk = self.e.extract("Calculator.cs", self.SAMPLE)
text = sk.to_text(verbose=True)
assert "simple calculator service" in text
assert "First operand" in text

def test_file_scoped_namespace(self):
code = '''
using System;

namespace MyApp.Services;

public class Calculator
{
public int Add(int a, int b)
{
return a + b;
}
}
'''
sk = self.e.extract("Calculator.cs", code)
names = {c.name for c in sk.classes}
assert "Calculator" in names

def test_property_accessor_signature(self):
code = '''
public class Calculator
{
/// <summary>
/// Current result.
/// </summary>
public int Result { get; set; }
}
'''
sk = self.e.extract("Calculator.cs", code)
cls = next(c for c in sk.classes if c.name == "Calculator")
methods = {m.name: m for m in cls.methods}
assert "Result" in methods
assert "get" in methods["Result"].params
assert "set" in methods["Result"].params


# ---------------------------------------------------------------------------
# C/C++
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -853,6 +982,12 @@ def test_go_dispatch(self):
assert "# main.go [Go]" in text
assert "Run" in text

def test_csharp_dispatch(self):
code = 'namespace Demo;\n\npublic class Util { public int Add(int a, int b) { return a + b; } }\n'
text = self.extractor.extract_skeleton("util.cs", code)
assert "# util.cs [C#]" in text
assert "class Util" in text

def test_unknown_extension_returns_none(self):
code = "def foo(x): pass\nclass Bar: pass\n"
result = self.extractor.extract_skeleton("script.lua", code)
Expand Down