diff options
Diffstat (limited to 'rag/extractors.go')
| -rw-r--r-- | rag/extractors.go | 181 |
1 files changed, 181 insertions, 0 deletions
diff --git a/rag/extractors.go b/rag/extractors.go new file mode 100644 index 0000000..0f9f3f4 --- /dev/null +++ b/rag/extractors.go @@ -0,0 +1,181 @@ +package rag + +import ( + "archive/zip" + "bytes" + "errors" + "fmt" + "io" + "os" + "os/exec" + "path" + "strings" + + "github.com/PuerkitoBio/goquery" + "github.com/ledongthuc/pdf" + "github.com/yuin/goldmark" + "github.com/yuin/goldmark/extension" + "github.com/yuin/goldmark/parser" + "github.com/yuin/goldmark/renderer/html" +) + +func ExtractText(fpath string) (string, error) { + ext := strings.ToLower(path.Ext(fpath)) + switch ext { + case ".txt": + return extractTextFromFile(fpath) + case ".md", ".markdown": + return extractTextFromMarkdown(fpath) + case ".html", ".htm": + return extractTextFromHtmlFile(fpath) + case ".epub": + return extractTextFromEpub(fpath) + case ".pdf": + return extractTextFromPdf(fpath) + default: + return "", fmt.Errorf("unsupported file format: %s", ext) + } +} + +func extractTextFromFile(fpath string) (string, error) { + data, err := os.ReadFile(fpath) + if err != nil { + return "", err + } + return string(data), nil +} + +func extractTextFromHtmlFile(fpath string) (string, error) { + data, err := os.ReadFile(fpath) + if err != nil { + return "", err + } + return extractTextFromHtmlContent(data) +} + +// non utf-8 encoding? +func extractTextFromHtmlContent(data []byte) (string, error) { + doc, err := goquery.NewDocumentFromReader(bytes.NewReader(data)) + if err != nil { + return "", err + } + // Remove script and style tags + doc.Find("script, style, noscript").Each(func(i int, s *goquery.Selection) { + s.Remove() + }) + // Get text and clean it + text := doc.Text() + // Collapse all whitespace (newlines, tabs, multiple spaces) into single spaces + cleaned := strings.Join(strings.Fields(text), " ") + return cleaned, nil +} + +func extractTextFromMarkdown(fpath string) (string, error) { + data, err := os.ReadFile(fpath) + if err != nil { + return "", err + } + // Convert markdown to HTML + md := goldmark.New( + goldmark.WithExtensions(extension.GFM), + goldmark.WithParserOptions(parser.WithAutoHeadingID()), + goldmark.WithRendererOptions(html.WithUnsafe()), // allow raw HTML if needed + ) + var buf bytes.Buffer + if err := md.Convert(data, &buf); err != nil { + return "", err + } + // Now extract text from the resulting HTML (using goquery or similar) + return extractTextFromHtmlContent(buf.Bytes()) +} + +func extractTextFromEpub(fpath string) (string, error) { + r, err := zip.OpenReader(fpath) + if err != nil { + return "", fmt.Errorf("failed to open epub: %w", err) + } + defer r.Close() + var sb strings.Builder + for _, f := range r.File { + ext := strings.ToLower(path.Ext(f.Name)) + if ext != ".xhtml" && ext != ".html" && ext != ".htm" && ext != ".xml" { + continue + } + + // Skip manifest, toc, ncx files - they don't contain book content + nameLower := strings.ToLower(f.Name) + if strings.Contains(nameLower, "toc") || strings.Contains(nameLower, "nav") || + strings.Contains(nameLower, "manifest") || strings.Contains(nameLower, ".opf") || + strings.HasSuffix(nameLower, ".ncx") { + continue + } + + rc, err := f.Open() + if err != nil { + continue + } + + if sb.Len() > 0 { + sb.WriteString("\n\n") + } + sb.WriteString(f.Name) + sb.WriteString("\n") + + buf, readErr := io.ReadAll(rc) + rc.Close() + if readErr == nil { + sb.WriteString(stripHTML(string(buf))) + } + } + if sb.Len() == 0 { + return "", errors.New("no content extracted from epub") + } + return sb.String(), nil +} + +func stripHTML(html string) string { + var sb strings.Builder + inTag := false + for _, r := range html { + switch r { + case '<': + inTag = true + case '>': + inTag = false + default: + if !inTag { + sb.WriteRune(r) + } + } + } + return sb.String() +} + +func extractTextFromPdf(fpath string) (string, error) { + _, err := exec.LookPath("pdftotext") + if err == nil { + out, err := exec.Command("pdftotext", "-layout", fpath, "-").Output() + if err == nil && len(out) > 0 { + return string(out), nil + } + } + return extractTextFromPdfPureGo(fpath) +} + +func extractTextFromPdfPureGo(fpath string) (string, error) { + df, r, err := pdf.Open(fpath) + if err != nil { + return "", fmt.Errorf("failed to open pdf: %w", err) + } + defer df.Close() + textReader, err := r.GetPlainText() + if err != nil { + return "", fmt.Errorf("failed to extract text from pdf: %w", err) + } + var buf bytes.Buffer + _, err = io.Copy(&buf, textReader) + if err != nil { + return "", fmt.Errorf("failed to read pdf text: %w", err) + } + return buf.String(), nil +} |
