Skip to content

Commit 780e36a

Browse files
authored
feat: add directory parsing support to OpenViking (#194)
* feat: add directory parsing support to OpenViking
  - Implemented DirectoryParser to handle local directories with mixed document types.
  - Enhanced add_resource function to support directory imports with options for including, excluding, and ignoring specific directories.
  - Updated client and service layers to forward additional parsing options.
  - Added unit tests for DirectoryParser to ensure correct functionality and error handling.
  - Improved user feedback with rich table summaries for processed, failed, unsupported, and skipped files during directory imports.
* docs: update README.md to include directory import instructions for add.py
* style: reformat files to pass CI code formatting
* style: reformat files to pass CI code formatting
1 parent b9508fe commit 780e36a

File tree

12 files changed

+1489
-18
lines changed

12 files changed

+1489
-18
lines changed

examples/query/README.md

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,35 @@ uv run query.py "What do we have here?" --score-threshold 0.5
2020
mv data/ data.bak/ # or rm -rf if you want
2121
```
2222

23+
## Add Directory
24+
25+
`add.py` supports adding an entire directory of documents at once. Files are automatically classified and parsed according to their type (PDF, Markdown, plain text, code, etc.). After the import, a summary table is printed showing which files were processed, failed, unsupported, or skipped (filtered out).
26+
27+
```bash
28+
# Add all supported files in a directory
29+
uv run add.py ~/Documents/research/
30+
31+
# Only include specific file types
32+
uv run add.py ~/project/ --include '*.md' --include '*.pdf'
33+
34+
# Exclude certain files
35+
uv run add.py ~/project/ --exclude 'test_*' --exclude '*.pyc'
36+
37+
# Skip specific sub-directories
38+
uv run add.py ~/project/ --ignore-dirs node_modules --ignore-dirs .git
39+
40+
# Combine options
41+
uv run add.py ~/project/ --include '*.md' --exclude 'draft_*' --ignore-dirs vendor
42+
```
43+
44+
### Directory Options
45+
46+
| Option | Description |
47+
|--------|-------------|
48+
| `--include PATTERN` | Glob pattern for files to include (can be repeated) |
49+
| `--exclude PATTERN` | Glob pattern for files to exclude (can be repeated) |
50+
| `--ignore-dirs NAME` | Directory names to skip (can be repeated) |
51+
2352
### Query Options
2453

2554
| Option | Default | Description |
@@ -50,7 +79,7 @@ Edit `ov.conf` to configure:
5079

5180
```
5281
rag.py # RAG pipeline library
53-
add.py # Add documents CLI
82+
add.py # Add documents/directories CLI
5483
query.py # Query CLI
5584
q # Quick query wrapper
5685
logging_config.py # Logging configuration
@@ -64,3 +93,4 @@ data/ # Database storage
6493
- Use `uv run query.py` for more control
6594
- Set `OV_DEBUG=1` only when debugging
6695
- Resources are indexed once, query unlimited times
96+
- When adding directories, use `--include` / `--exclude` to control which files are imported

examples/query/add.py

Lines changed: 157 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,116 @@
77
import json
88
import sys
99
from pathlib import Path
10+
from typing import Any, Dict, List
11+
12+
from rich import box
13+
from rich.console import Console
14+
from rich.table import Table
1015

1116
import openviking as ov
1217
from openviking_cli.utils.config.open_viking_config import OpenVikingConfig
1318

19+
# Module-level rich console; the directory-summary table below is printed through it.
console = Console()
20+
21+
22+
# ── Table helpers ──────────────────────────────────────────────────
23+
24+
25+
def _print_directory_summary(meta: Dict[str, Any], errors: List[str]) -> None:
    """Print a rich-table summary for a directory import.

    Args:
        meta: Import metadata. The lists stored under ``processed_files``,
            ``failed_files``, ``unsupported_files`` and ``skipped_files``
            are read (each defaults to empty). Every entry must carry a
            ``path`` key; processed entries may carry ``parser`` and skipped
            entries may carry ``status`` (shown as ``skip`` when absent).
        errors: Warning/error messages from the import. A message is
            attributed to a failed file when it contains that file's path;
            failed files without a matching message show ``parse error``.

    File paths, parser names, statuses and error messages are escaped before
    being placed in table cells, so square brackets in them are displayed
    literally instead of being interpreted as rich console markup.
    """
    # Local import: keeps this helper self-contained without touching the
    # module-level import block.
    from rich.markup import escape

    processed: List[Dict[str, str]] = meta.get("processed_files", [])
    failed: List[Dict[str, str]] = meta.get("failed_files", [])
    unsupported: List[Dict[str, str]] = meta.get("unsupported_files", [])
    skipped: List[Dict[str, str]] = meta.get("skipped_files", [])

    n_total = len(processed) + len(failed) + len(unsupported) + len(skipped)
    if n_total == 0:
        console.print(" (no files found)", style="dim")
        return

    # Build a single combined table (ROUNDED box style, same as query.py)
    table = Table(
        title=f"Directory Import ({n_total} files)",
        box=box.ROUNDED,
        show_header=True,
        header_style="bold magenta",
        title_style="bold magenta",
    )
    table.add_column("#", style="cyan", width=4)
    table.add_column("Status", no_wrap=True)
    table.add_column("File", style="bold white", no_wrap=True)
    table.add_column("Detail")

    # Match failed files to their warning messages. Longer paths are tried
    # first so that a message about "docs/aa.md" is not attributed to a
    # failed file whose path is merely a substring of it (e.g. "a.md").
    failed_paths = sorted((f["path"] for f in failed), key=len, reverse=True)
    fail_reasons: Dict[str, str] = {}
    for err in errors:
        for path in failed_paths:
            if path in err:
                fail_reasons[path] = err
                break

    # Collect (status, file, detail) cells in display order; the running
    # row number is added when the rows are written to the table.
    rows: List[tuple] = []

    for f in processed:
        parser_name = escape(str(f.get("parser", "")))
        rows.append(
            (
                "[green]processed[/green]",
                escape(str(f["path"])),
                f"[dim]{parser_name}[/dim]",
            )
        )

    for f in failed:
        reason = escape(str(fail_reasons.get(f["path"], "parse error")))
        rows.append(
            (
                "[red]failed[/red]",
                escape(str(f["path"])),
                f"[red]{reason}[/red]",
            )
        )

    for f in unsupported:
        rows.append(("[yellow]unsupported[/yellow]", escape(str(f["path"])), ""))

    for f in skipped:
        status = escape(str(f.get("status", "skip")))
        skipped_path = escape(str(f["path"]))
        rows.append((f"[dim]{status}[/dim]", f"[dim]{skipped_path}[/dim]", ""))

    for idx, (status_cell, file_cell, detail_cell) in enumerate(rows, start=1):
        table.add_row(str(idx), status_cell, file_cell, detail_cell)

    console.print()
    console.print(table)
100+
101+
102+
# ── Main logic ─────────────────────────────────────────────────────
103+
104+
105+
def add_resource(
106+
resource_path: str,
107+
config_path: str = "./ov.conf",
108+
data_path: str = "./data",
109+
**kwargs,
110+
):
16111
"""
17112
Add a resource to OpenViking database
18113
19114
Args:
20115
resource_path: Path to file, directory, or URL
21116
config_path: Path to config file
22117
data_path: Path to data directory
118+
**kwargs: Extra options forwarded to ``add_resource`` (e.g.
119+
``include``, ``exclude``, ``ignore_dirs``).
23120
"""
24121
# Load config
25122
print(f"📋 Loading config from: {config_path}")
@@ -34,25 +131,37 @@ def add_resource(resource_path: str, config_path: str = "./ov.conf", data_path:
34131
client.initialize()
35132
print("✓ Initialized\n")
36133

37-
print(f"📂 Adding resource: {resource_path}")
38-
39-
# Check if it's a file and exists
40-
if not resource_path.startswith("http"):
134+
# Check if it's a local path and exists
135+
is_local = not resource_path.startswith("http")
136+
is_directory = False
137+
if is_local:
41138
path = Path(resource_path).expanduser()
42139
if not path.exists():
43-
print(f"❌ Error: File not found: {path}")
140+
print(f"❌ Error: Path not found: {path}")
44141
return False
142+
is_directory = path.is_dir()
45143

46-
result = client.add_resource(path=resource_path)
144+
if is_directory:
145+
print(f"📂 Adding directory: {resource_path}")
146+
else:
147+
print(f"📄 Adding resource: {resource_path}")
148+
149+
result = client.add_resource(path=resource_path, **kwargs)
47150

48151
# Check result
49152
if result and "root_uri" in result:
50153
root_uri = result["root_uri"]
51-
print(f"✓ Resource added: {root_uri}\n")
154+
meta = result.get("meta", {})
155+
errors = result.get("errors", [])
156+
print(f"✓ Resource added: {root_uri}")
157+
158+
# Show directory-specific table
159+
if is_directory:
160+
_print_directory_summary(meta, errors)
52161

53162
# Wait for processing
54-
print("⏳ Processing and indexing...")
55-
client.wait_processed(timeout=300)
163+
print("\n⏳ Processing and indexing...")
164+
client.wait_processed(timeout=600 if is_directory else 300)
56165
print("✓ Processing complete!\n")
57166

58167
print("🎉 Resource is now searchable in the database!")
@@ -61,7 +170,7 @@ def add_resource(resource_path: str, config_path: str = "./ov.conf", data_path:
61170
elif result and result.get("status") == "error":
62171
print("\n⚠️ Resource had parsing issues:")
63172
if "errors" in result:
64-
for error in result["errors"][:3]:
173+
for error in result["errors"][:5]:
65174
print(f" - {error}")
66175
print("\n💡 Some content may still be searchable.")
67176
return False
@@ -123,6 +232,31 @@ def main():
123232
"--data", type=str, default="./data", help="Path to data directory (default: ./data)"
124233
)
125234

235+
# Directory-specific options
236+
dir_group = parser.add_argument_group("directory options")
237+
dir_group.add_argument(
238+
"--include",
239+
type=str,
240+
action="append",
241+
default=None,
242+
help="Glob pattern for files to include (can be repeated, e.g. --include '*.md')",
243+
)
244+
dir_group.add_argument(
245+
"--exclude",
246+
type=str,
247+
action="append",
248+
default=None,
249+
help="Glob pattern for files to exclude (can be repeated, e.g. --exclude 'test_*')",
250+
)
251+
dir_group.add_argument(
252+
"--ignore-dirs",
253+
type=str,
254+
action="append",
255+
default=None,
256+
dest="ignore_dirs",
257+
help="Directory names to skip (can be repeated, e.g. --ignore-dirs node_modules)",
258+
)
259+
126260
args = parser.parse_args()
127261

128262
# Expand user paths
@@ -132,8 +266,19 @@ def main():
132266
else args.resource
133267
)
134268

269+
# Build kwargs for directory options
270+
# scan_directory expects include/exclude as comma-separated strings,
271+
# and ignore_dirs as a Set[str].
272+
dir_kwargs = {}
273+
if args.include:
274+
dir_kwargs["include"] = ",".join(args.include)
275+
if args.exclude:
276+
dir_kwargs["exclude"] = ",".join(args.exclude)
277+
if args.ignore_dirs:
278+
dir_kwargs["ignore_dirs"] = set(args.ignore_dirs)
279+
135280
# Add the resource
136-
success = add_resource(resource_path, args.config, args.data)
281+
success = add_resource(resource_path, args.config, args.data, **dir_kwargs)
137282

138283
sys.exit(0 if success else 1)
139284

openviking/async_client.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,12 +143,15 @@ async def add_resource(
143143
instruction: str = "",
144144
wait: bool = False,
145145
timeout: float = None,
146+
**kwargs,
146147
) -> Dict[str, Any]:
147148
"""Add resource to OpenViking (only supports resources scope).
148149
149150
Args:
150151
wait: Whether to wait for semantic extraction and vectorization to complete
151152
timeout: Wait timeout in seconds
153+
**kwargs: Extra options forwarded to the parser chain, e.g.
154+
``strict``, ``ignore_dirs``, ``include``, ``exclude``.
152155
"""
153156
await self._ensure_initialized()
154157
return await self._client.add_resource(
@@ -158,6 +161,7 @@ async def add_resource(
158161
instruction=instruction,
159162
wait=wait,
160163
timeout=timeout,
164+
**kwargs,
161165
)
162166

163167
async def wait_processed(self, timeout: float = None) -> Dict[str, Any]:

openviking/client/local.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ async def add_resource(
5858
instruction: str = "",
5959
wait: bool = False,
6060
timeout: Optional[float] = None,
61+
**kwargs,
6162
) -> Dict[str, Any]:
6263
"""Add resource to OpenViking."""
6364
return await self._service.resources.add_resource(
@@ -67,6 +68,7 @@ async def add_resource(
6768
instruction=instruction,
6869
wait=wait,
6970
timeout=timeout,
71+
**kwargs,
7072
)
7173

7274
async def add_skill(

0 commit comments

Comments
 (0)