mirror of
https://git.mills.io/prologic/zs.git
synced 2026-06-12 19:59:19 -04:00
Add index and query support for fast indexing and querying of documents
This commit is contained in:
43
README.md
43
README.md
@@ -20,6 +20,7 @@ Table of Contents:
|
||||
* [RSS](#rss)
|
||||
* [Hooks](#hooks)
|
||||
* [Routes](#routes)
|
||||
* [Index](#index)
|
||||
* [Command line usage](#command-line-usage)
|
||||
* [zs Users](#zs-users)
|
||||
* [Frequently Asked Questions](#frequently-asked-questions)
|
||||
@@ -292,6 +293,48 @@ This serves files from `/assets/docs/` while keeping the URL under `/docs/`.
|
||||
|
||||
⚡️ Useful for migrations, vanity URLs, or serving legacy paths without a reverse proxy.
|
||||
|
||||
## Index
|
||||
|
||||
`zs` supports indexing documents and providing a query command for quickly
|
||||
retrieving documents or metadata on documents easily, without having
|
||||
to spend time walking directory trees and parsing files over and over again.
|
||||
|
||||
### Building the index
|
||||
|
||||
```
|
||||
zs index -r . -i "posts/*.md" -i "pages/*.md" -e ".cache/*" -o .cache/zs/index.json
|
||||
```
|
||||
|
||||
Flags:
|
||||
- `-r, --root` Root directory to index (default `.`).
|
||||
- `-i, --include` Glob include patterns (repeatable). If omitted, all parsable content files are considered.
|
||||
- `-e, --exclude` Glob exclude patterns (repeatable).
|
||||
- `-o, --out` Output index path (default `.cache/zs/index.json`).
|
||||
|
||||
### Querying the index
|
||||
|
||||
- Extract variables from a document (front matter + defaults):
|
||||
|
||||
```
|
||||
zs query vars posts/2025-01-02-hello.md
|
||||
```
|
||||
|
||||
- Find neighbors (prev/next in chronological order within the same directory) using the on-disk index:
|
||||
|
||||
```
|
||||
zs query neighbors posts/2025-01-02-hello.md -o .cache/zs/index.json
|
||||
```
|
||||
|
||||
- List posts by tag / year / month:
|
||||
|
||||
```
|
||||
zs query list --tag go --year 2025 --month 9
|
||||
```
|
||||
|
||||
The on-disk index is a single JSON file containing a `generated` timestamp and a `records` array; each record has `path`, `url`, `title`, `date`, `vars`, `tags`, and `neighbors`.
|
||||
|
||||
---
|
||||
|
||||
## Command line usage
|
||||
|
||||
- `zs build` re-builds your site.
|
||||
|
||||
388
index_query.go
Normal file
388
index_query.go
Normal file
@@ -0,0 +1,388 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
gitIgnore "github.com/sabhiram/go-gitignore"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
// defaultIndexFile is the path, relative to the working directory, that
// `zs index` writes the index to and that the `zs query` commands read it from.
const defaultIndexFile = ".cache.json"
|
||||
|
||||
// IndexRecord is the per-document entry stored in the index. It holds just
// enough metadata to answer queries without re-reading the document.
type IndexRecord struct {
	// Path is the slash-separated document path, e.g. posts/2025-01-02-hello.md.
	Path string `json:"path"`

	// URL is the document's computed url without a leading "/".
	URL string `json:"url"`

	// Title is the document title taken from its variables.
	Title string `json:"title"`

	// Date is resolved from the "date" variable, the filename or the file's
	// modification time, in that order.
	Date time.Time `json:"date"`

	// Vars is a copy of all of the document's variables, kept for convenience.
	Vars map[string]string `json:"vars"`

	// Tags is the "tags" variable split into individual tags.
	Tags []string `json:"tags"`

	// Neighbors holds the paths of the adjacent documents; a nil pointer means
	// there is no neighbor on that side and the key is omitted from the JSON.
	Neighbors struct {
		Prev *string `json:"prev,omitempty"`
		Next *string `json:"next,omitempty"`
	} `json:"neighbors"`
}
|
||||
|
||||
// IndexFile is the materialized index saved on disk.
|
||||
type IndexFile struct {
|
||||
Generated time.Time `json:"generated"`
|
||||
Records []IndexRecord `json:"records"`
|
||||
ByPath map[string]int `json:"-"` // not serialized; built at load time
|
||||
}
|
||||
|
||||
// parseDate attempts a few common locations: front matter "date", filename, mtime.
|
||||
func parseDate(path string, vars Vars, info os.FileInfo) time.Time {
|
||||
// front matter "date"
|
||||
if v, ok := vars["date"]; ok && strings.TrimSpace(v) != "" {
|
||||
// try RFC3339, "2006-01-02", or "2006-01-02 15:04"
|
||||
layouts := []string{time.RFC3339, "2006-01-02", "2006-01-02 15:04"}
|
||||
for _, layout := range layouts {
|
||||
if t, err := time.Parse(layout, v); err == nil {
|
||||
return t
|
||||
}
|
||||
}
|
||||
}
|
||||
// filename prefix "YYYY-MM-DD-..."
|
||||
base := filepath.Base(path)
|
||||
if len(base) >= 10 {
|
||||
prefix := base[:10]
|
||||
if t, err := time.Parse("2006-01-02", prefix); err == nil {
|
||||
return t
|
||||
}
|
||||
}
|
||||
// fallback to mtime
|
||||
return info.ModTime()
|
||||
}
|
||||
|
||||
// splitTags turns a raw tags value into a list of tags.
//
// If the value contains a comma it is split on commas; every piece is trimmed
// of surrounding whitespace and blank pieces are dropped (so a value made only
// of commas and whitespace yields an empty, non-nil slice). Otherwise the value
// is split on runs of whitespace. An empty or whitespace-only value yields nil.
func splitTags(tags string) []string {
	if tags == "" {
		return nil
	}

	if !strings.Contains(tags, ",") {
		// No comma anywhere: treat the value as "a b c".
		if words := strings.Fields(tags); len(words) > 0 {
			return words
		}
		return nil
	}

	// Commas take precedence: treat the value as "a, b, c".
	pieces := strings.Split(tags, ",")
	result := make([]string, 0, len(pieces))
	for i := range pieces {
		if tag := strings.TrimSpace(pieces[i]); tag != "" {
			result = append(result, tag)
		}
	}
	return result
}
|
||||
|
||||
// ----- `zs index` -----
|
||||
|
||||
var indexCmd = &cobra.Command{
|
||||
Use: "index",
|
||||
Short: "Index documents in a zs site for fast queries",
|
||||
Long: "Walk the site root, parse front matter, and build a queryable index for vars, tags, dates, and neighbors.",
|
||||
RunE: func(_ *cobra.Command, _ []string) error {
|
||||
// Resolve .zsignore relative to root (do not rely on global Ignore)
|
||||
var Ignore *gitIgnore.GitIgnore
|
||||
if obj, err := gitIgnore.CompileIgnoreFile(ZSIGNORE); err == nil {
|
||||
Ignore = obj
|
||||
} else {
|
||||
Ignore = gitIgnore.CompileIgnoreLines(DefaultIgnore)
|
||||
}
|
||||
|
||||
var records []IndexRecord
|
||||
|
||||
err := filepath.Walk(".", func(path string, info os.FileInfo, err error) error {
|
||||
if err != nil {
|
||||
// log and continue
|
||||
log.WithError(err).Warn("walk error")
|
||||
return nil
|
||||
}
|
||||
|
||||
// ignore hidden files and directories and ignored patterns
|
||||
// except for the default .routes file (redirects, rewrites, etc)
|
||||
if filepath.Base(path) != ".routes" && (filepath.Base(path)[0] == '.' || strings.HasPrefix(path, ".") || Ignore.MatchesPath(path)) {
|
||||
return nil
|
||||
}
|
||||
|
||||
// inform user about fs walk errors, but continue iteration
|
||||
if err != nil {
|
||||
log.WithError(err).Warn("error walking directory")
|
||||
return nil
|
||||
}
|
||||
|
||||
if info.IsDir() {
|
||||
return nil
|
||||
}
|
||||
|
||||
log.Debugf("indexing %s", path)
|
||||
|
||||
// Only index files we can parse or are likely content: markdown, html, txt by default
|
||||
ext := strings.ToLower(filepath.Ext(path))
|
||||
switch ext {
|
||||
case ".md", ".markdown", ".html", ".htm", ".txt":
|
||||
default:
|
||||
// we still allow if file starts with front matter
|
||||
// but to keep index small, skip binaries
|
||||
return nil
|
||||
}
|
||||
|
||||
vars, _, err := getVars(path, globals())
|
||||
if err != nil {
|
||||
log.WithError(err).Warnf("getVars failed for %s", path)
|
||||
return nil
|
||||
}
|
||||
if vars == nil {
|
||||
// ignored by .zsignore
|
||||
return nil
|
||||
}
|
||||
|
||||
// Skip drafts globally
|
||||
if strings.EqualFold(vars["draft"], "true") {
|
||||
return nil
|
||||
}
|
||||
|
||||
rec := IndexRecord{
|
||||
Path: filepath.ToSlash(path),
|
||||
URL: strings.TrimPrefix(vars["url"], "/"),
|
||||
Title: vars["title"],
|
||||
Vars: map[string]string{},
|
||||
}
|
||||
for k, v := range vars {
|
||||
rec.Vars[k] = v
|
||||
}
|
||||
rec.Tags = splitTags(vars["tags"])
|
||||
rec.Date = parseDate(path, vars, info)
|
||||
|
||||
records = append(records, rec)
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Sort newest first (descending date), tiebreak by path
|
||||
sort.SliceStable(records, func(i, j int) bool {
|
||||
if records[i].Date.Equal(records[j].Date) {
|
||||
return records[i].Path < records[j].Path
|
||||
}
|
||||
return records[i].Date.After(records[j].Date)
|
||||
})
|
||||
|
||||
// Build neighbors in the "posts/" namespace if present, otherwise global
|
||||
var idxByPath = map[string]int{}
|
||||
for i := range records {
|
||||
idxByPath[records[i].Path] = i
|
||||
}
|
||||
|
||||
// group by dir (so neighbors stay within directory like posts/)
|
||||
groups := map[string][]int{}
|
||||
for i, r := range records {
|
||||
dir := filepath.Dir(r.Path)
|
||||
groups[dir] = append(groups[dir], i)
|
||||
}
|
||||
for _, arr := range groups {
|
||||
for j, i := range arr {
|
||||
var prev, next *string
|
||||
if j+1 < len(arr) {
|
||||
p := records[arr[j+1]].Path
|
||||
prev = &p
|
||||
}
|
||||
if j-1 >= 0 {
|
||||
n := records[arr[j-1]].Path
|
||||
next = &n
|
||||
}
|
||||
records[i].Neighbors.Prev = prev
|
||||
records[i].Neighbors.Next = next
|
||||
}
|
||||
}
|
||||
|
||||
index := IndexFile{
|
||||
Generated: time.Now(),
|
||||
Records: records,
|
||||
}
|
||||
|
||||
f, err := os.Create(defaultIndexFile)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
enc := json.NewEncoder(f)
|
||||
enc.SetIndent("", " ")
|
||||
if err := enc.Encode(&index); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Fprintf(os.Stdout, "%d records written to %s\n", len(records), defaultIndexFile)
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
func loadIndex(path string) (*IndexFile, error) {
|
||||
b, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var idx IndexFile
|
||||
if err := json.Unmarshal(b, &idx); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
idx.ByPath = make(map[string]int, len(idx.Records))
|
||||
for i, r := range idx.Records {
|
||||
idx.ByPath[r.Path] = i
|
||||
}
|
||||
return &idx, nil
|
||||
}
|
||||
|
||||
// ----- `zs query` -----
|
||||
|
||||
// queryCmd is the parent `zs query` command. It defines no run function of its
// own; the vars, neighbors and list sub-commands are attached to it in init.
var queryCmd = &cobra.Command{
|
||||
Use: "query",
|
||||
Short: "Query site metadata and the on-disk index",
|
||||
}
|
||||
|
||||
var queryVarsCmd = &cobra.Command{
|
||||
Use: "vars <path>",
|
||||
Short: "Extract variables from a document",
|
||||
Args: cobra.ExactArgs(1),
|
||||
RunE: func(_ *cobra.Command, args []string) error {
|
||||
path := args[0]
|
||||
|
||||
// fast path: index
|
||||
if idx, err := loadIndex(defaultIndexFile); err == nil {
|
||||
key := filepath.ToSlash(path)
|
||||
if i, ok := idx.ByPath[key]; ok {
|
||||
v := idx.Records[i].Vars
|
||||
// ensure url/title present from record
|
||||
if v["url"] == "" {
|
||||
v["url"] = idx.Records[i].URL
|
||||
}
|
||||
if v["title"] == "" {
|
||||
v["title"] = idx.Records[i].Title
|
||||
}
|
||||
json.NewEncoder(os.Stdout).Encode(v)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
vars, _, err := getVars(path, globals())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if vars == nil {
|
||||
return errors.New("no variables (possibly ignored by .zsignore)")
|
||||
}
|
||||
enc := json.NewEncoder(os.Stdout)
|
||||
enc.SetIndent("", " ")
|
||||
return enc.Encode(vars)
|
||||
},
|
||||
}
|
||||
|
||||
var queryNeighborsCmd = &cobra.Command{
|
||||
Use: "neighbors <path>",
|
||||
Short: "List neighbors (prev/next) of a document using the index",
|
||||
Args: cobra.ExactArgs(1),
|
||||
RunE: func(_ *cobra.Command, args []string) error {
|
||||
idx, err := loadIndex(defaultIndexFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("load index %s: %w", defaultIndexFile, err)
|
||||
}
|
||||
key := filepath.ToSlash(args[0])
|
||||
i, ok := idx.ByPath[key]
|
||||
if !ok {
|
||||
return fmt.Errorf("path %q not found in index", key)
|
||||
}
|
||||
type Neighbor struct {
|
||||
Path string `json:"path"`
|
||||
URL string `json:"url"`
|
||||
Title string `json:"title"`
|
||||
}
|
||||
var out struct {
|
||||
Prev *Neighbor `json:"prev,omitempty"`
|
||||
Next *Neighbor `json:"next,omitempty"`
|
||||
}
|
||||
rec := idx.Records[i]
|
||||
if rec.Neighbors.Prev != nil {
|
||||
p := *rec.Neighbors.Prev
|
||||
pi := idx.ByPath[p]
|
||||
r := idx.Records[pi]
|
||||
out.Prev = &Neighbor{Path: r.Path, URL: r.URL, Title: r.Title}
|
||||
}
|
||||
if rec.Neighbors.Next != nil {
|
||||
n := *rec.Neighbors.Next
|
||||
ni := idx.ByPath[n]
|
||||
r := idx.Records[ni]
|
||||
out.Next = &Neighbor{Path: r.Path, URL: r.URL, Title: r.Title}
|
||||
}
|
||||
enc := json.NewEncoder(os.Stdout)
|
||||
enc.SetIndent("", " ")
|
||||
return enc.Encode(out)
|
||||
},
|
||||
}
|
||||
|
||||
var queryListCmd = &cobra.Command{
|
||||
Use: "list",
|
||||
Short: "List documents from the index (filter by tag/year/month)",
|
||||
RunE: func(cmd *cobra.Command, _ []string) error {
|
||||
idx, err := loadIndex(defaultIndexFile)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
tag, _ := cmd.Flags().GetString("tag")
|
||||
year, _ := cmd.Flags().GetInt("year")
|
||||
month, _ := cmd.Flags().GetInt("month")
|
||||
|
||||
var out []IndexRecord
|
||||
for _, r := range idx.Records {
|
||||
if tag != "" {
|
||||
found := false
|
||||
for _, t := range r.Tags {
|
||||
if strings.EqualFold(t, tag) {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
continue
|
||||
}
|
||||
}
|
||||
if year != 0 && r.Date.Year() != year {
|
||||
continue
|
||||
}
|
||||
if month != 0 && int(r.Date.Month()) != month {
|
||||
continue
|
||||
}
|
||||
out = append(out, r)
|
||||
}
|
||||
enc := json.NewEncoder(os.Stdout)
|
||||
enc.SetIndent("", " ")
|
||||
return enc.Encode(out)
|
||||
},
|
||||
}
|
||||
|
||||
func init() {
|
||||
// Sub-commands
|
||||
queryCmd.AddCommand(queryVarsCmd)
|
||||
queryCmd.AddCommand(queryNeighborsCmd)
|
||||
queryListCmd.Flags().String("tag", "", "filter by tag")
|
||||
queryListCmd.Flags().Int("year", 0, "filter by year (YYYY)")
|
||||
queryListCmd.Flags().Int("month", 0, "filter by month (1-12)")
|
||||
queryCmd.AddCommand(queryListCmd)
|
||||
|
||||
// Wire into root in their own init() file scope
|
||||
RootCmd.AddCommand(indexCmd)
|
||||
RootCmd.AddCommand(queryCmd)
|
||||
}
|
||||
Reference in New Issue
Block a user