diff options
| author | Grail Finder <wohilas@gmail.com> | 2026-02-25 14:54:10 +0300 |
|---|---|---|
| committer | Grail Finder <wohilas@gmail.com> | 2026-02-25 14:54:10 +0300 |
| commit | b386c1181f0424418999fe143f1285d395b0db0f (patch) | |
| tree | 33ea22a1e6c21c3a4fd9d432ed604a344cffa0c0 /rag/extractors.go | |
| parent | b8e7649e69ef26cccd63371af12fbacd8c309fd8 (diff) | |
Fix (rag): epub load
Diffstat (limited to 'rag/extractors.go')
| -rw-r--r-- | rag/extractors.go | 38 |
1 files changed, 29 insertions, 9 deletions
diff --git a/rag/extractors.go b/rag/extractors.go index 1cf97af..4255fdb 100644 --- a/rag/extractors.go +++ b/rag/extractors.go @@ -1,6 +1,7 @@ package rag import ( + "archive/zip" "bytes" "errors" "fmt" @@ -12,7 +13,6 @@ import ( "github.com/PuerkitoBio/goquery" "github.com/ledongthuc/pdf" - "github.com/n3integration/epub" "github.com/yuin/goldmark" "github.com/yuin/goldmark/extension" "github.com/yuin/goldmark/parser" @@ -90,26 +90,46 @@ func extractTextFromMarkdown(fpath string) (string, error) { } func extractTextFromEpub(fpath string) (string, error) { - book, err := epub.Open(fpath) + r, err := zip.OpenReader(fpath) if err != nil { return "", fmt.Errorf("failed to open epub: %w", err) } - defer book.Close() + defer r.Close() + var sb strings.Builder - err = book.Each(func(title string, xhtml io.ReadCloser) { + + for _, f := range r.File { + ext := strings.ToLower(path.Ext(f.Name)) + if ext != ".xhtml" && ext != ".html" && ext != ".htm" && ext != ".xml" { + continue + } + + // Skip manifest, toc, ncx files - they don't contain book content + nameLower := strings.ToLower(f.Name) + if strings.Contains(nameLower, "toc") || strings.Contains(nameLower, "nav") || + strings.Contains(nameLower, "manifest") || strings.Contains(nameLower, ".opf") || + strings.HasSuffix(nameLower, ".ncx") { + continue + } + + rc, err := f.Open() + if err != nil { + continue + } + if sb.Len() > 0 { sb.WriteString("\n\n") } - sb.WriteString(title) + sb.WriteString(f.Name) sb.WriteString("\n") - buf, readErr := io.ReadAll(xhtml) + + buf, readErr := io.ReadAll(rc) + rc.Close() if readErr == nil { sb.WriteString(stripHTML(string(buf))) } - }) - if err != nil { - return "", fmt.Errorf("failed to iterate epub chapters: %w", err) } + if sb.Len() == 0 { return "", errors.New("no content extracted from epub") } |
