summary | refs | log | tree | commit | diff
path: root/rag/extractors.go
diff options
context:
space:
mode:
Diffstat (limited to 'rag/extractors.go')
-rw-r--r-- rag/extractors.go | 181
1 files changed, 181 insertions, 0 deletions
diff --git a/rag/extractors.go b/rag/extractors.go
new file mode 100644
index 0000000..0f9f3f4
--- /dev/null
+++ b/rag/extractors.go
@@ -0,0 +1,181 @@
+package rag
+
+import (
+ "archive/zip"
+ "bytes"
+ "errors"
+ "fmt"
+ "io"
+ "os"
+ "os/exec"
+ "path"
+ "strings"
+
+ "github.com/PuerkitoBio/goquery"
+ "github.com/ledongthuc/pdf"
+ "github.com/yuin/goldmark"
+ "github.com/yuin/goldmark/extension"
+ "github.com/yuin/goldmark/parser"
+ "github.com/yuin/goldmark/renderer/html"
+)
+
+func ExtractText(fpath string) (string, error) {
+ ext := strings.ToLower(path.Ext(fpath))
+ switch ext {
+ case ".txt":
+ return extractTextFromFile(fpath)
+ case ".md", ".markdown":
+ return extractTextFromMarkdown(fpath)
+ case ".html", ".htm":
+ return extractTextFromHtmlFile(fpath)
+ case ".epub":
+ return extractTextFromEpub(fpath)
+ case ".pdf":
+ return extractTextFromPdf(fpath)
+ default:
+ return "", fmt.Errorf("unsupported file format: %s", ext)
+ }
+}
+
+// extractTextFromFile reads the whole file at fpath and returns its bytes
+// converted to a string, without any further processing. A read failure is
+// returned as-is together with an empty string.
+func extractTextFromFile(fpath string) (string, error) {
+	raw, readErr := os.ReadFile(fpath)
+	if readErr == nil {
+		return string(raw), nil
+	}
+	return "", readErr
+}
+
+// extractTextFromHtmlFile loads the HTML file at fpath into memory and
+// delegates to extractTextFromHtmlContent to turn it into plain text.
+// A read failure is returned unchanged.
+func extractTextFromHtmlFile(fpath string) (string, error) {
+	markup, readErr := os.ReadFile(fpath)
+	if readErr != nil {
+		return "", readErr
+	}
+	text, err := extractTextFromHtmlContent(markup)
+	return text, err
+}
+
+// extractTextFromHtmlContent parses data as an HTML document and returns its
+// text content on a single line.
+//
+// script, style and noscript elements are removed before the text is
+// collected, and every run of whitespace (newlines, tabs, repeated spaces)
+// is collapsed to one space. A parse failure is returned unchanged.
+//
+// NOTE(review): data is handed to the parser as-is; nothing here converts
+// non-UTF-8 input. Confirm whether callers can supply other encodings.
+func extractTextFromHtmlContent(data []byte) (string, error) {
+	doc, parseErr := goquery.NewDocumentFromReader(bytes.NewReader(data))
+	if parseErr != nil {
+		return "", parseErr
+	}
+
+	// Drop elements whose contents are not human-readable text.
+	doc.Find("script, style, noscript").Remove()
+
+	// Splitting on whitespace and re-joining normalises all spacing.
+	words := strings.Fields(doc.Text())
+	return strings.Join(words, " "), nil
+}
+
+func extractTextFromMarkdown(fpath string) (string, error) {
+ data, err := os.ReadFile(fpath)
+ if err != nil {
+ return "", err
+ }
+ // Convert markdown to HTML
+ md := goldmark.New(
+ goldmark.WithExtensions(extension.GFM),
+ goldmark.WithParserOptions(parser.WithAutoHeadingID()),
+ goldmark.WithRendererOptions(html.WithUnsafe()), // allow raw HTML if needed
+ )
+ var buf bytes.Buffer
+ if err := md.Convert(data, &buf); err != nil {
+ return "", err
+ }
+ // Now extract text from the resulting HTML (using goquery or similar)
+ return extractTextFromHtmlContent(buf.Bytes())
+}
+
+// extractTextFromEpub opens the EPUB at fpath as a zip archive and
+// concatenates the tag-stripped text of its content documents.
+//
+// Only entries with a .xhtml, .html, .htm or .xml extension are considered,
+// and entries whose lower-cased name contains "toc", "nav", "manifest" or
+// ".opf" are skipped as non-content files. Each extracted entry is emitted as
+// its archive name on one line followed by its text; entries are separated by
+// a blank line. Entries that cannot be opened or read are skipped.
+//
+// It returns an error when the archive cannot be opened or when no entry
+// yielded any output.
+func extractTextFromEpub(fpath string) (string, error) {
+	r, err := zip.OpenReader(fpath)
+	if err != nil {
+		return "", fmt.Errorf("failed to open epub: %w", err)
+	}
+	defer r.Close()
+
+	var sb strings.Builder
+	for _, f := range r.File {
+		switch strings.ToLower(path.Ext(f.Name)) {
+		case ".xhtml", ".html", ".htm", ".xml":
+		default:
+			continue
+		}
+
+		// Skip manifest/toc/navigation files - they don't contain book
+		// content. (.ncx entries never get here: the extension filter above
+		// already rejects them.)
+		nameLower := strings.ToLower(f.Name)
+		if strings.Contains(nameLower, "toc") || strings.Contains(nameLower, "nav") ||
+			strings.Contains(nameLower, "manifest") || strings.Contains(nameLower, ".opf") {
+			continue
+		}
+
+		rc, err := f.Open()
+		if err != nil {
+			continue
+		}
+		buf, readErr := io.ReadAll(rc)
+		rc.Close()
+		if readErr != nil {
+			// Best effort: drop unreadable entries entirely, so no orphan
+			// name header is left behind and an archive with no readable
+			// entries still reports "no content".
+			continue
+		}
+
+		if sb.Len() > 0 {
+			sb.WriteString("\n\n")
+		}
+		sb.WriteString(f.Name)
+		sb.WriteString("\n")
+		sb.WriteString(stripHTML(string(buf)))
+	}
+
+	if sb.Len() == 0 {
+		return "", errors.New("no content extracted from epub")
+	}
+	return sb.String(), nil
+}
+
+// stripHTML returns html with every tag removed, where a tag is everything
+// from a '<' up to and including the next '>'.
+//
+// A '>' that appears outside of a tag is ordinary text and is copied through
+// (e.g. "a > b" stays "a > b"). An unterminated '<' swallows the rest of the
+// input. Character entities such as "&amp;" are not decoded, and no
+// whitespace is inserted where tags are removed.
+func stripHTML(html string) string {
+	var sb strings.Builder
+	sb.Grow(len(html)) // pre-size for the common case; output is usually no longer than the input
+	inTag := false
+	for _, r := range html {
+		switch {
+		case r == '<':
+			inTag = true
+		case r == '>' && inTag:
+			// Only a '>' that closes an open tag is markup.
+			inTag = false
+		case !inTag:
+			sb.WriteRune(r)
+		}
+	}
+	return sb.String()
+}
+
+// extractTextFromPdf extracts text from the PDF at fpath.
+//
+// When a pdftotext executable is found on PATH it is run as
+// "pdftotext -layout <fpath> -" and its standard output is returned. If the
+// tool is missing, exits with an error, or prints nothing, the function
+// falls back to extractTextFromPdfPureGo; failures of the external tool are
+// deliberately not reported.
+func extractTextFromPdf(fpath string) (string, error) {
+	if _, lookErr := exec.LookPath("pdftotext"); lookErr != nil {
+		return extractTextFromPdfPureGo(fpath)
+	}
+
+	// "-" as the output argument makes pdftotext write to stdout.
+	converted, runErr := exec.Command("pdftotext", "-layout", fpath, "-").Output()
+	if runErr != nil || len(converted) == 0 {
+		return extractTextFromPdfPureGo(fpath)
+	}
+	return string(converted), nil
+}
+
+func extractTextFromPdfPureGo(fpath string) (string, error) {
+ df, r, err := pdf.Open(fpath)
+ if err != nil {
+ return "", fmt.Errorf("failed to open pdf: %w", err)
+ }
+ defer df.Close()
+ textReader, err := r.GetPlainText()
+ if err != nil {
+ return "", fmt.Errorf("failed to extract text from pdf: %w", err)
+ }
+ var buf bytes.Buffer
+ _, err = io.Copy(&buf, textReader)
+ if err != nil {
+ return "", fmt.Errorf("failed to read pdf text: %w", err)
+ }
+ return buf.String(), nil
+}