Add index and query support for fast indexing and querying of documents

This commit is contained in:
James Mills
2025-10-04 00:01:53 +10:00
parent 67cf3b2e40
commit 0dd77ffa85
2 changed files with 431 additions and 0 deletions

View File

@@ -20,6 +20,7 @@ Table of Contents:
* [RSS](#rss)
* [Hooks](#hooks)
* [Routes](#routes)
* [Index](#index)
* [Command line usage](#command-line-usage)
* [zs Users](#zs-users)
* [Frequently Asked Questions](#frequently-asked-questions)
@@ -292,6 +293,48 @@ This serves files from `/assets/docs/` while keeping the URL under `/docs/`.
⚡️ Useful for migrations, vanity URLs, or serving legacy paths without a reverse proxy.
## Index
`zs` can index your documents and provides a `query` command for retrieving
documents, or metadata about them, quickly and easily, without having to walk
directory trees and parse files over and over again.
### Building the index
```
zs index -r . -i "posts/*.md" -i "pages/*.md" -e ".cache/*" -o .cache/zs/index.json
```
Flags:
- `-r, --root` Root directory to index (default `.`).
- `-i, --include` Glob include patterns (repeatable). If omitted, all parsable content files are considered.
- `-e, --exclude` Glob exclude patterns (repeatable).
- `-o, --out` Output index path (default `.cache/zs/index.json`).
### Querying the index
- Extract variables from a document (front matter + defaults):
```
zs query vars posts/2025-01-02-hello.md
```
- Find neighbors (prev/next in chronological order within the same directory) using the on-disk index:
```
zs query neighbors posts/2025-01-02-hello.md -o .cache/zs/index.json
```
- List posts by tag / year / month:
```
zs query list --tag go --year 2025 --month 9
```
The on-disk index is a single JSON file containing a `generated` timestamp and a `records` array; each record has `path`, `url`, `title`, `date`, `vars`, `tags`, and `neighbors`.
---
## Command line usage
- `zs build` re-builds your site.

388
index_query.go Normal file
View File

@@ -0,0 +1,388 @@
package main
import (
"encoding/json"
"errors"
"fmt"
"os"
"path/filepath"
"sort"
"strings"
"time"
gitIgnore "github.com/sabhiram/go-gitignore"
log "github.com/sirupsen/logrus"
"github.com/spf13/cobra"
)
// defaultIndexFile is the path of the on-disk index: `zs index` creates it and
// the `zs query` sub-commands load it. It is a relative path, so it resolves
// against the current working directory.
// NOTE(review): the README documents `.cache/zs/index.json` as the default
// output path — confirm which of the two is intended.
const defaultIndexFile = ".cache.json"
// IndexRecord captures the minimal data we want to query quickly.
// One record is produced per indexed document and serialized as a JSON object.
type IndexRecord struct {
Path string `json:"path"` // slash-separated path as seen by the walk, e.g. posts/2025-01-02-hello.md
URL string `json:"url"` // vars["url"] with any leading "/" removed
Title string `json:"title"` // vars["title"]
Date time.Time `json:"date"` // front matter "date", else YYYY-MM-DD filename prefix, else file mtime
Vars map[string]string `json:"vars"` // copy of every variable returned by getVars for the document
Tags []string `json:"tags"` // vars["tags"] split on commas, or on whitespace when there is no comma
// Neighbors links a record to the adjacent records of the same directory.
// Both fields hold the Path of another record and are omitted from the JSON
// when there is no such neighbor.
Neighbors struct {
// Prev is the next-older document in the same directory.
Prev *string `json:"prev,omitempty"`
// Next is the next-newer document in the same directory.
Next *string `json:"next,omitempty"`
} `json:"neighbors"`
}
// IndexFile is the materialized index saved on disk.
// It is encoded as a single JSON object with the keys "generated" and "records".
type IndexFile struct {
// Generated is the time at which the index was built.
Generated time.Time `json:"generated"`
// Records holds one entry per indexed document. `zs index` writes them
// sorted newest first, with ties broken by path.
Records []IndexRecord `json:"records"`
// ByPath maps a record's Path to its position in Records. loadIndex fills
// it in after decoding the file.
ByPath map[string]int `json:"-"` // not serialized; built at load time
}
// parseDate determines the date of the document at path. It tries, in order:
//
//  1. the front matter variable "date" (surrounding whitespace is ignored),
//     parsed as RFC 3339, "2006-01-02" or "2006-01-02 15:04";
//  2. a "YYYY-MM-DD" prefix of the file's base name;
//  3. the modification time reported by info.
//
// A "date" variable that matches none of the layouts is not an error; the
// function simply falls through to the next source. If every source fails and
// info is nil, the zero time.Time is returned.
func parseDate(path string, vars Vars, info os.FileInfo) time.Time {
	// 1. Front matter. Trim before parsing: time.Parse rejects padded input,
	// so a value such as "2025-01-02 " would otherwise be silently ignored.
	if v := strings.TrimSpace(vars["date"]); v != "" {
		layouts := []string{time.RFC3339, "2006-01-02", "2006-01-02 15:04"}
		for _, layout := range layouts {
			if t, err := time.Parse(layout, v); err == nil {
				return t
			}
		}
	}

	// 2. Filename prefix "YYYY-MM-DD-...".
	const dateLayout = "2006-01-02"
	if base := filepath.Base(path); len(base) >= len(dateLayout) {
		if t, err := time.Parse(dateLayout, base[:len(dateLayout)]); err == nil {
			return t
		}
	}

	// 3. Modification time. Guard against a missing FileInfo so callers that
	// have none get the zero time instead of a nil-pointer panic.
	if info == nil {
		return time.Time{}
	}
	return info.ModTime()
}
// splitTags converts a tag string into a list of tags.
//
// If tags contains a comma it is split on commas and each part is trimmed, so
// tags may contain inner spaces ("static sites, go" yields "static sites" and
// "go"). Otherwise it is split on whitespace ("a b c").
//
// Empty parts are dropped. When no tag remains the result is always nil,
// whichever separator was used, so that every "no tags" input is encoded the
// same way in the JSON index.
func splitTags(tags string) []string {
	// No comma: whitespace separated.
	if !strings.Contains(tags, ",") {
		fields := strings.Fields(tags)
		if len(fields) == 0 {
			return nil
		}
		return fields
	}

	// Comma separated; prefer commas whenever one is present.
	parts := strings.Split(tags, ",")
	out := make([]string, 0, len(parts))
	for _, p := range parts {
		if p = strings.TrimSpace(p); p != "" {
			out = append(out, p)
		}
	}
	if len(out) == 0 {
		// e.g. "," or " , ," — behave like the empty string.
		return nil
	}
	return out
}
// ----- `zs index` -----
// indexCmd implements `zs index`. It walks the current directory, reads the
// variables of every content file (.md, .markdown, .html, .htm, .txt) that is
// not hidden, ignored or marked as a draft, and writes the resulting records
// to defaultIndexFile as indented JSON.
//
// Records are sorted newest first (ties broken by path) and each record is
// linked to the next-older (prev) and next-newer (next) record that lives in
// the same directory.
var indexCmd = &cobra.Command{
	Use:   "index",
	Short: "Index documents in a zs site for fast queries",
	Long:  "Walk the site root, parse front matter, and build a queryable index for vars, tags, dates, and neighbors.",
	RunE: func(_ *cobra.Command, _ []string) error {
		// Compile the ignore rules locally instead of relying on package-level
		// state; fall back to the default rules when the ignore file cannot be
		// compiled (for example because it does not exist).
		var ignore *gitIgnore.GitIgnore
		if obj, err := gitIgnore.CompileIgnoreFile(ZSIGNORE); err == nil {
			ignore = obj
		} else {
			ignore = gitIgnore.CompileIgnoreLines(DefaultIgnore)
		}

		var records []IndexRecord
		err := filepath.Walk(".", func(path string, info os.FileInfo, err error) error {
			if err != nil {
				// Inform the user about the problem but keep indexing the
				// rest of the tree.
				log.WithError(err).Warn("walk error")
				return nil
			}

			if info.IsDir() {
				// Every entry below a top-level hidden directory (.git,
				// .cache, ...) is rejected by the file filter below, so prune
				// the whole subtree instead of visiting it. The walk root
				// itself is "." and must not be pruned.
				if path != "." && strings.HasPrefix(path, ".") {
					return filepath.SkipDir
				}
				return nil
			}

			// Skip hidden files, anything below a top-level hidden directory
			// and ignored paths. The .routes file is exempt from this filter,
			// although the extension check below still excludes it.
			base := filepath.Base(path)
			hidden := base[0] == '.' || strings.HasPrefix(path, ".")
			if base != ".routes" && (hidden || ignore.MatchesPath(path)) {
				return nil
			}

			log.Debugf("indexing %s", path)

			// Only index likely content files; this keeps binaries and other
			// assets out of the index.
			switch strings.ToLower(filepath.Ext(path)) {
			case ".md", ".markdown", ".html", ".htm", ".txt":
			default:
				return nil
			}

			vars, _, err := getVars(path, globals())
			if err != nil {
				log.WithError(err).Warnf("getVars failed for %s", path)
				return nil
			}
			if vars == nil {
				// ignored by .zsignore
				return nil
			}
			// Drafts never appear in the index.
			if strings.EqualFold(vars["draft"], "true") {
				return nil
			}

			rec := IndexRecord{
				Path:  filepath.ToSlash(path),
				URL:   strings.TrimPrefix(vars["url"], "/"),
				Title: vars["title"],
				Vars:  make(map[string]string, len(vars)),
				Tags:  splitTags(vars["tags"]),
				Date:  parseDate(path, vars, info),
			}
			for k, v := range vars {
				rec.Vars[k] = v
			}
			records = append(records, rec)
			return nil
		})
		if err != nil {
			return fmt.Errorf("walk site: %w", err)
		}

		// Sort newest first (descending date), tiebreak by path.
		sort.SliceStable(records, func(i, j int) bool {
			if records[i].Date.Equal(records[j].Date) {
				return records[i].Path < records[j].Path
			}
			return records[i].Date.After(records[j].Date)
		})

		linkIndexNeighbors(records)

		index := IndexFile{
			Generated: time.Now(),
			Records:   records,
		}
		if err := writeIndexFile(defaultIndexFile, &index); err != nil {
			return fmt.Errorf("write index %s: %w", defaultIndexFile, err)
		}

		fmt.Fprintf(os.Stdout, "%d records written to %s\n", len(records), defaultIndexFile)
		return nil
	},
}

// linkIndexNeighbors sets Neighbors on every record. records must already be
// sorted newest first. Neighbors never cross a directory boundary: within the
// records sharing a directory, Prev is the next-older record and Next the
// next-newer one.
func linkIndexNeighbors(records []IndexRecord) {
	// Positions of the records of each directory, in sorted order.
	groups := make(map[string][]int)
	for i := range records {
		dir := filepath.Dir(records[i].Path)
		groups[dir] = append(groups[dir], i)
	}

	for _, members := range groups {
		for j, i := range members {
			var prev, next *string
			if j+1 < len(members) {
				older := records[members[j+1]].Path
				prev = &older
			}
			if j > 0 {
				newer := records[members[j-1]].Path
				next = &newer
			}
			records[i].Neighbors.Prev = prev
			records[i].Neighbors.Next = next
		}
	}
}

// writeIndexFile encodes index as indented JSON into the file name, creating
// or truncating it. The error from Close is reported as well, because a failed
// close can mean the data never reached the disk.
func writeIndexFile(name string, index *IndexFile) (err error) {
	f, err := os.Create(name)
	if err != nil {
		return err
	}
	defer func() {
		if cerr := f.Close(); err == nil {
			err = cerr
		}
	}()

	enc := json.NewEncoder(f)
	enc.SetIndent("", " ")
	return enc.Encode(index)
}
// loadIndex reads the JSON index stored at path, decodes it and rebuilds the
// ByPath lookup (record path -> position in Records), which is not part of the
// file. Read and decode errors are returned unchanged, together with a nil
// index.
func loadIndex(path string) (*IndexFile, error) {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return nil, readErr
	}

	index := new(IndexFile)
	if decodeErr := json.Unmarshal(raw, index); decodeErr != nil {
		return nil, decodeErr
	}

	// Later records win if a path occurs more than once.
	lookup := make(map[string]int, len(index.Records))
	for pos := range index.Records {
		lookup[index.Records[pos].Path] = pos
	}
	index.ByPath = lookup

	return index, nil
}
// ----- `zs query` -----
// queryCmd is the parent `zs query` command. It sets no Run function of its
// own; the vars, neighbors and list sub-commands are attached to it in init().
var queryCmd = &cobra.Command{
Use: "query",
Short: "Query site metadata and the on-disk index",
}
// queryVarsCmd implements `zs query vars <path>`: it prints the variables of a
// single document to stdout as an indented JSON object.
//
// The on-disk index is consulted first. If it cannot be loaded, or does not
// contain the document, the variables are read from the document itself via
// getVars. An error is returned when getVars fails or yields no variables.
var queryVarsCmd = &cobra.Command{
	Use:   "vars <path>",
	Short: "Extract variables from a document",
	Args:  cobra.ExactArgs(1),
	RunE: func(_ *cobra.Command, args []string) error {
		path := args[0]

		// Both code paths share one encoder so the output format does not
		// depend on whether the index was used.
		enc := json.NewEncoder(os.Stdout)
		enc.SetIndent("", " ")

		// Fast path: the index. A missing or unreadable index is not an
		// error here; we simply fall back to reading the document.
		if idx, err := loadIndex(defaultIndexFile); err == nil {
			if i, ok := idx.ByPath[filepath.ToSlash(path)]; ok {
				rec := &idx.Records[i]
				// Copy instead of writing into rec.Vars: the map is nil when
				// the index contains "vars": null, and assigning to a nil map
				// panics.
				out := make(map[string]string, len(rec.Vars)+2)
				for k, v := range rec.Vars {
					out[k] = v
				}
				// ensure url/title present from record
				if out["url"] == "" {
					out["url"] = rec.URL
				}
				if out["title"] == "" {
					out["title"] = rec.Title
				}
				return enc.Encode(out)
			}
		}

		// Slow path: read the variables from the document itself.
		vars, _, err := getVars(path, globals())
		if err != nil {
			return err
		}
		if vars == nil {
			return errors.New("no variables (possibly ignored by .zsignore)")
		}
		return enc.Encode(vars)
	},
}
// queryNeighborsCmd implements `zs query neighbors <path>`: it looks the
// document up in the on-disk index and prints its previous and next neighbor
// (path, url and title of each) to stdout as an indented JSON object. A
// neighbor that does not exist is omitted from the output.
//
// It fails when the index cannot be loaded, when the document is not in the
// index, or when the index names a neighbor that it does not contain.
var queryNeighborsCmd = &cobra.Command{
	Use:   "neighbors <path>",
	Short: "List neighbors (prev/next) of a document using the index",
	Args:  cobra.ExactArgs(1),
	RunE: func(_ *cobra.Command, args []string) error {
		idx, err := loadIndex(defaultIndexFile)
		if err != nil {
			return fmt.Errorf("load index %s: %w", defaultIndexFile, err)
		}

		key := filepath.ToSlash(args[0])
		i, ok := idx.ByPath[key]
		if !ok {
			return fmt.Errorf("path %q not found in index", key)
		}

		type Neighbor struct {
			Path  string `json:"path"`
			URL   string `json:"url"`
			Title string `json:"title"`
		}

		// resolve turns a neighbor reference into its summary. A nil reference
		// means "no neighbor". A reference to an unknown path is reported
		// rather than silently resolved to the first record of the index.
		resolve := func(ref *string) (*Neighbor, error) {
			if ref == nil {
				return nil, nil
			}
			j, ok := idx.ByPath[*ref]
			if !ok {
				return nil, fmt.Errorf("neighbor %q of %q not found in index %s", *ref, key, defaultIndexFile)
			}
			r := &idx.Records[j]
			return &Neighbor{Path: r.Path, URL: r.URL, Title: r.Title}, nil
		}

		var out struct {
			Prev *Neighbor `json:"prev,omitempty"`
			Next *Neighbor `json:"next,omitempty"`
		}
		rec := &idx.Records[i]
		if out.Prev, err = resolve(rec.Neighbors.Prev); err != nil {
			return err
		}
		if out.Next, err = resolve(rec.Neighbors.Next); err != nil {
			return err
		}

		enc := json.NewEncoder(os.Stdout)
		enc.SetIndent("", " ")
		return enc.Encode(out)
	},
}
// queryListCmd implements `zs query list`: it prints the records of the
// on-disk index to stdout as an indented JSON array, in index order.
//
// The optional --tag (case-insensitive), --year and --month flags narrow the
// result; a record must satisfy every filter that is set. An empty string or 0
// disables the corresponding filter. When nothing matches, an empty array is
// printed.
var queryListCmd = &cobra.Command{
	Use:   "list",
	Short: "List documents from the index (filter by tag/year/month)",
	RunE: func(cmd *cobra.Command, _ []string) error {
		flags := cmd.Flags()
		tag, err := flags.GetString("tag")
		if err != nil {
			return err
		}
		year, err := flags.GetInt("year")
		if err != nil {
			return err
		}
		month, err := flags.GetInt("month")
		if err != nil {
			return err
		}
		// 0 means "any month"; anything else outside 1-12 can never match, so
		// report it instead of printing an empty list.
		if month < 0 || month > 12 {
			return fmt.Errorf("invalid --month %d: must be between 1 and 12", month)
		}

		idx, err := loadIndex(defaultIndexFile)
		if err != nil {
			return fmt.Errorf("load index %s: %w", defaultIndexFile, err)
		}

		hasTag := func(tags []string) bool {
			for _, t := range tags {
				if strings.EqualFold(t, tag) {
					return true
				}
			}
			return false
		}

		// Start from an empty, non-nil slice so that "no matches" is encoded
		// as [] rather than null.
		out := []IndexRecord{}
		for i := range idx.Records {
			r := &idx.Records[i]
			if tag != "" && !hasTag(r.Tags) {
				continue
			}
			if year != 0 && r.Date.Year() != year {
				continue
			}
			if month != 0 && int(r.Date.Month()) != month {
				continue
			}
			out = append(out, *r)
		}

		enc := json.NewEncoder(os.Stdout)
		enc.SetIndent("", " ")
		return enc.Encode(out)
	},
}
func init() {
// Sub-commands
queryCmd.AddCommand(queryVarsCmd)
queryCmd.AddCommand(queryNeighborsCmd)
queryListCmd.Flags().String("tag", "", "filter by tag")
queryListCmd.Flags().Int("year", 0, "filter by year (YYYY)")
queryListCmd.Flags().Int("month", 0, "filter by month (1-12)")
queryCmd.AddCommand(queryListCmd)
// Wire into root in their own init() file scope
RootCmd.AddCommand(indexCmd)
RootCmd.AddCommand(queryCmd)
}