// amfora/feeds/feeds.go

package feeds

import (
	"crypto/sha256"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"mime"
	urlPkg "net/url"
	"os"
	"path"
	"reflect"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/makeworld-the-better-one/amfora/client"
	"github.com/makeworld-the-better-one/amfora/config"
	"github.com/makeworld-the-better-one/amfora/logger"
	"github.com/makeworld-the-better-one/go-gemini"
	"github.com/mmcdole/gofeed"
	"github.com/spf13/viper"
)

// TODO: Test for deadlocks and whether there should be more
// goroutines for file writing or other things.

var (
	ErrSaving     = errors.New("couldn't save JSON to disk")
	ErrNotSuccess = errors.New("status 20 not returned")
	ErrNotFeed    = errors.New("not a valid feed")
)

var writeMu = sync.Mutex{} // Prevent concurrent writes to feeds.json file

// LastUpdated is the time when the in-memory data was last updated.
// It can be used to know if the feed page should be regenerated.
var LastUpdated time.Time
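
// Editor's sketch (not part of the original file): per the comment above,
// a caller rendering the feed page could cache its output and only
// regenerate when LastUpdated has moved past the caller's own (hypothetical)
// lastRendered timestamp:
//
//	if feeds.LastUpdated.After(lastRendered) {
//		regenerateFeedPage() // hypothetical caller-side function
//		lastRendered = time.Now()
//	}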

// Init should be called after config.Init.
func Init() error {
	f, err := os.Open(config.FeedPath)
	if err == nil {
		// File exists and could be opened
		defer f.Close()

		fi, err := f.Stat()
		if err == nil && fi.Size() > 0 {
			// File is not empty
			dec := json.NewDecoder(f)
			err = dec.Decode(&data)
			if err != nil && err != io.EOF {
				return fmt.Errorf("feeds.json is corrupted: %w", err) //nolint:goerr113
			}
		}
	} else if !os.IsNotExist(err) {
		// There's an error opening the file, but not because it doesn't exist
		return fmt.Errorf("open feeds.json error: %w", err) //nolint:goerr113
	}

	LastUpdated = time.Now()

	if viper.GetInt("feeds.update_interval") > 0 {
		// Update feeds and pages every so often
		go func() {
			for {
				updateAll()
				time.Sleep(time.Duration(viper.GetInt("feeds.update_interval")) * time.Second)
			}
		}()
	} else {
		// User disabled automatic feed/page updates,
		// so just update once at the beginning
		go updateAll()
	}

	return nil
}
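
// Editor's sketch (hedged, not from the original source): the expected call
// order, per the doc comment above, is config.Init first, then feeds.Init.
// Assuming config.Init also returns an error, startup code might look like:
//
//	if err := config.Init(); err != nil {
//		log.Fatal(err)
//	}
//	if err := feeds.Init(); err != nil {
//		log.Fatal(err) // e.g. a corrupted feeds.json
//	}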

// IsTracked returns true if the feed/page URL is already being tracked.
func IsTracked(url string) bool {
	logger.Log.Println("feeds.IsTracked called")

	data.feedMu.RLock()
	for u := range data.Feeds {
		if url == u {
			data.feedMu.RUnlock()
			return true
		}
	}
	data.feedMu.RUnlock()

	data.pageMu.RLock()
	for u := range data.Pages {
		if url == u {
			data.pageMu.RUnlock()
			return true
		}
	}
	data.pageMu.RUnlock()
	return false
}

// GetFeed returns a Feed object and a bool indicating whether the passed
// content was actually recognized as a feed.
func GetFeed(mediatype, filename string, r io.Reader) (*gofeed.Feed, bool) {
	logger.Log.Println("feeds.GetFeed called")

	if r == nil {
		return nil, false
	}

	// Check mediatype and filename
	if mediatype != "application/atom+xml" && mediatype != "application/rss+xml" && mediatype != "application/json+feed" &&
		filename != "atom.xml" && filename != "feed.xml" && filename != "feed.json" &&
		!strings.HasSuffix(filename, ".atom") && !strings.HasSuffix(filename, ".rss") {
		// No part of the above is true
		return nil, false
	}
	feed, err := gofeed.NewParser().Parse(r)
	if feed == nil {
		return nil, false
	}
	return feed, err == nil
}
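
// Editor's sketch (hypothetical values, not in the original file): GetFeed
// only attempts parsing when the mediatype or the filename looks feed-like,
// so a caller passes both along with the content reader:
//
//	feed, ok := feeds.GetFeed("application/atom+xml", "atom.xml",
//		strings.NewReader(atomXML)) // atomXML is a hypothetical string
//	if !ok {
//		// content was not recognized as a feed
//	}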

func writeJSON() error {
	logger.Log.Println("feeds.writeJSON called")

	writeMu.Lock()
	defer writeMu.Unlock()

	// O_TRUNC so a shorter new encoding doesn't leave stale bytes
	// from the previous file behind, corrupting the JSON
	f, err := os.OpenFile(config.FeedPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)
	if err != nil {
		logger.Log.Println("feeds.writeJSON error", err)
		return err
	}
	defer f.Close()
	enc := json.NewEncoder(f)
	enc.SetEscapeHTML(false)
	enc.SetIndent("", " ")

	data.Lock()
	logger.Log.Println("feeds.writeJSON acquired data lock")
	err = enc.Encode(&data)
	data.Unlock()

	if err != nil {
		logger.Log.Println("feeds.writeJSON error", err)
	}
	return err
}

// AddFeed stores a feed.
// It can be used to update a feed for a URL, although the package
// will handle that on its own.
func AddFeed(url string, feed *gofeed.Feed) error {
	logger.Log.Println("feeds.AddFeed called")

	if feed == nil {
		panic("feed is nil")
	}

	// Remove any unused fields to save memory and disk space
	feed.Image = nil
	feed.Generator = ""
	feed.Categories = nil
	feed.DublinCoreExt = nil
	feed.ITunesExt = nil
	feed.Custom = nil
	for _, item := range feed.Items {
		item.Description = ""
		item.Content = ""
		item.Image = nil
		item.Categories = nil
		item.Enclosures = nil
		item.DublinCoreExt = nil
		item.ITunesExt = nil
		item.Extensions = nil
		item.Custom = nil
	}

	data.feedMu.Lock()
	oldFeed, ok := data.Feeds[url]
	if !ok || !reflect.DeepEqual(feed, oldFeed) {
		// Feeds are different, or there was never an old one
		LastUpdated = time.Now()
		data.Feeds[url] = feed
		data.feedMu.Unlock()
		err := writeJSON()
		if err != nil {
			return ErrSaving
		}
	} else {
		data.feedMu.Unlock()
	}
	return nil
}
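
// Editor's sketch: the intended pairing, mirrored by updateFeed further
// below, is to parse with GetFeed and persist with AddFeed. The variables
// here (mediatype, filename, body, url) are hypothetical placeholders:
//
//	if feed, ok := feeds.GetFeed(mediatype, filename, body); ok {
//		if err := feeds.AddFeed(url, feed); err != nil {
//			// ErrSaving: feeds.json couldn't be written
//		}
//	}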

// AddPage stores a page to track for changes.
// It can be used to update the page as well, although the package
// will handle that on its own.
func AddPage(url string, r io.Reader) error {
	logger.Log.Println("feeds.AddPage called")

	if r == nil {
		return nil
	}

	h := sha256.New()
	if _, err := io.Copy(h, r); err != nil {
		return err
	}
	newHash := fmt.Sprintf("%x", h.Sum(nil))

	data.pageMu.Lock()
	_, ok := data.Pages[url]
	if !ok || data.Pages[url].Hash != newHash {
		// Page content is different, or it didn't exist
		LastUpdated = time.Now()
		data.Pages[url] = &pageJSON{
			Hash:    newHash,
			Changed: time.Now().UTC(),
		}
		data.pageMu.Unlock()
		err := writeJSON()
		if err != nil {
			return ErrSaving
		}
	} else {
		data.pageMu.Unlock()
	}
	return nil
}
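
// Editor's sketch (hedged): AddPage hashes the page body with SHA-256 and
// records a change time only when the hash differs; updatePage below is the
// real call site. A hypothetical caller with a fetched response:
//
//	if err := feeds.AddPage(url, res.Body); err != nil {
//		// ErrSaving: the change was detected but couldn't be persisted
//	}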

func updateFeed(url string) error {
	logger.Log.Println("feeds.updateFeed called")

	res, err := client.Fetch(url)
	if err != nil {
		if res != nil {
			res.Body.Close()
		}
		return err
	}
	defer res.Body.Close()

	if res.Status != gemini.StatusSuccess {
		return ErrNotSuccess
	}
	mediatype, _, err := mime.ParseMediaType(res.Meta)
	if err != nil {
		return err
	}
	filename := path.Base(url)
	feed, ok := GetFeed(mediatype, filename, res.Body)
	if !ok {
		return ErrNotFeed
	}
	return AddFeed(url, feed)
}

func updatePage(url string) error {
	logger.Log.Println("feeds.updatePage called")

	res, err := client.Fetch(url)
	if err != nil {
		if res != nil {
			res.Body.Close()
		}
		return err
	}
	defer res.Body.Close()

	if res.Status != gemini.StatusSuccess {
		return ErrNotSuccess
	}
	return AddPage(url, res.Body)
}

// updateAll updates all feeds and pages using workers.
// It only returns once all the workers are done.
func updateAll() {
	logger.Log.Println("feeds.updateAll called")

	// TODO: Is two goroutines the right amount?

	worker := func(jobs <-chan [2]string, wg *sync.WaitGroup) {
		// Each job is: [2]string{<type>, "url"}
		// where <type> is "feed" or "page"
		defer wg.Done()
		for j := range jobs {
			if j[0] == "feed" {
				updateFeed(j[1]) //nolint:errcheck
			} else if j[0] == "page" {
				updatePage(j[1]) //nolint:errcheck
			}
		}
	}

	var wg sync.WaitGroup

	data.RLock()

	numJobs := len(data.Feeds) + len(data.Pages)
	jobs := make(chan [2]string, numJobs)

	if numJobs == 0 {
		data.RUnlock()
		return
	}

	numWorkers := viper.GetInt("feeds.workers")
	if numWorkers < 1 {
		numWorkers = 1
	}

	// Start workers, waiting for jobs
	for w := 0; w < numWorkers; w++ {
		wg.Add(1)
		go func(i int) {
			logger.Log.Println("started worker", i)
			worker(jobs, &wg)
			logger.Log.Println("ended worker", i)
		}(w)
	}

	// Get map keys in a slice
	feedKeys := make([]string, len(data.Feeds))
	i := 0
	for k := range data.Feeds {
		feedKeys[i] = k
		i++
	}
	pageKeys := make([]string, len(data.Pages))
	i = 0
	for k := range data.Pages {
		pageKeys[i] = k
		i++
	}
	data.RUnlock()

	for j := 0; j < numJobs; j++ {
		if j < len(feedKeys) {
			jobs <- [2]string{"feed", feedKeys[j]}
		} else {
			// In the Pages
			jobs <- [2]string{"page", pageKeys[j-len(feedKeys)]}
		}
	}
	close(jobs)

	wg.Wait()
}

// GetPageEntries returns the current list of PageEntries
// for use in rendering a page.
// The contents of the returned entries will never change,
// so this function needs to be called again to get updates.
// It always returns sorted entries - by post time, from newest to oldest.
func GetPageEntries() *PageEntries {
	logger.Log.Println("feeds.GetPageEntries called")

	var pe PageEntries

	data.RLock()

	for _, feed := range data.Feeds {
		for _, item := range feed.Items {
			var pub time.Time

			// Try to use updated time first, then published.
			// Nil checks are needed because gofeed leaves these
			// fields nil when the feed doesn't provide them.
			if item.UpdatedParsed != nil && !item.UpdatedParsed.IsZero() {
				pub = *item.UpdatedParsed
			} else if item.PublishedParsed != nil && !item.PublishedParsed.IsZero() {
				pub = *item.PublishedParsed
			} else {
				// No time on the post
				pub = time.Now()
			}

			// Prefer using the feed title over anything else.
			// Many feeds in Gemini only have this due to gemfeed's default settings.
			prefix := feed.Title

			if prefix == "" {
				// feed.Title was empty
				if feed.Author != nil {
					// Prefer using the feed author over the item author
					prefix = feed.Author.Name
				} else if item.Author != nil {
					prefix = item.Author.Name
				} else {
					prefix = "[author unknown]"
				}
			} else {
				// There's already a title, so add the author (if it exists) to
				// the end of the title in parentheses.
				// Don't add the author if it's the same as the title.
				if feed.Author != nil && feed.Author.Name != prefix {
					// Prefer using the feed author over the item author
					prefix += " (" + feed.Author.Name + ")"
				} else if item.Author != nil && item.Author.Name != prefix {
					prefix += " (" + item.Author.Name + ")"
				}
			}

			pe.Entries = append(pe.Entries, &PageEntry{
				Prefix:    prefix,
				Title:     item.Title,
				URL:       item.Link,
				Published: pub,
			})
		}
	}

	for url, page := range data.Pages {
		parsed, _ := urlPkg.Parse(url)

		// Path is title
		title := parsed.Path
		if strings.HasPrefix(title, "/~") {
			// A user dir
			title = title[2:] // Remove beginning slash and tilde
			// Remove trailing slash if the root of a user dir is being tracked.
			// The empty-string check guards against a bare "/~" path.
			if title != "" && strings.Count(title, "/") <= 1 && title[len(title)-1] == '/' {
				title = title[:len(title)-1]
			}
		} else if strings.HasPrefix(title, "/users/") {
			// "/users/" is removed for aesthetics when tracking hosted users
			title = strings.TrimPrefix(title, "/users/")
			title = strings.TrimPrefix(title, "~") // Remove leading tilde
			// Remove trailing slash if the root of a user dir is being tracked.
			// The empty-string check guards against a bare "/users/" path.
			if title != "" && strings.Count(title, "/") <= 1 && title[len(title)-1] == '/' {
				title = title[:len(title)-1]
			}
		}

		pe.Entries = append(pe.Entries, &PageEntry{
			Prefix:    parsed.Host,
			Title:     title,
			URL:       url,
			Published: page.Changed,
		})
	}

	data.RUnlock()

	sort.Sort(&pe)
	return &pe
}
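
// Editor's sketch (hedged): rendering the sorted entries is up to the
// caller. Assuming only the PageEntry fields used above, a gemtext link
// line per entry might be built like this:
//
//	for _, e := range feeds.GetPageEntries().Entries {
//		line := fmt.Sprintf("=> %s %s - %s (%s)",
//			e.URL, e.Published.Format("2006-01-02"), e.Prefix, e.Title)
//		_ = line // write to the feed page buffer (hypothetical)
//	}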