summaryrefslogtreecommitdiff
path: root/rag
diff options
context:
space:
mode:
authorGrail Finder <wohilas@gmail.com>2026-02-25 07:51:24 +0300
committerGrail Finder <wohilas@gmail.com>2026-02-25 07:51:24 +0300
commit6664c1a0fc9e6688e67ffcba9ce348f0a95afc6a (patch)
tree83a9144463f6d9f31c51384a18d8282b24e50322 /rag
parente0c3fe554fe5057891962234076d061ca58f694e (diff)
Dep (rag): better extractors
Diffstat (limited to 'rag')
-rw-r--r--rag/extractors.go55
1 files changed, 33 insertions, 22 deletions
diff --git a/rag/extractors.go b/rag/extractors.go
index fcc6a2a..1cf97af 100644
--- a/rag/extractors.go
+++ b/rag/extractors.go
@@ -10,21 +10,24 @@ import (
"path"
"strings"
- "github.com/huantt/plaintext-extractor"
+ "github.com/PuerkitoBio/goquery"
"github.com/ledongthuc/pdf"
"github.com/n3integration/epub"
+ "github.com/yuin/goldmark"
+ "github.com/yuin/goldmark/extension"
+ "github.com/yuin/goldmark/parser"
+ "github.com/yuin/goldmark/renderer/html"
)
func ExtractText(fpath string) (string, error) {
ext := strings.ToLower(path.Ext(fpath))
-
switch ext {
case ".txt":
return extractTextFromFile(fpath)
case ".md", ".markdown":
return extractTextFromMarkdown(fpath)
case ".html", ".htm":
- return extractTextFromHtml(fpath)
+ return extractTextFromHtmlFile(fpath)
case ".epub":
return extractTextFromEpub(fpath)
case ".pdf":
@@ -42,30 +45,48 @@ func extractTextFromFile(fpath string) (string, error) {
return string(data), nil
}
-func extractTextFromMarkdown(fpath string) (string, error) {
+func extractTextFromHtmlFile(fpath string) (string, error) {
data, err := os.ReadFile(fpath)
if err != nil {
return "", err
}
- extractor := plaintext.NewMarkdownExtractor()
- text, err := extractor.PlainText(string(data))
+ return extractTextFromHtmlContent(data)
+}
+
+// non utf-8 encoding?
+func extractTextFromHtmlContent(data []byte) (string, error) {
+ doc, err := goquery.NewDocumentFromReader(bytes.NewReader(data))
if err != nil {
return "", err
}
- return *text, nil
+ // Remove script and style tags
+ doc.Find("script, style, noscript").Each(func(i int, s *goquery.Selection) {
+ s.Remove()
+ })
+ // Get text and clean it
+ text := doc.Text()
+ // Collapse all whitespace (newlines, tabs, multiple spaces) into single spaces
+ cleaned := strings.Join(strings.Fields(text), " ")
+ return cleaned, nil
}
-func extractTextFromHtml(fpath string) (string, error) {
+func extractTextFromMarkdown(fpath string) (string, error) {
data, err := os.ReadFile(fpath)
if err != nil {
return "", err
}
- extractor := plaintext.NewHtmlExtractor()
- text, err := extractor.PlainText(string(data))
- if err != nil {
+ // Convert markdown to HTML
+ md := goldmark.New(
+ goldmark.WithExtensions(extension.GFM),
+ goldmark.WithParserOptions(parser.WithAutoHeadingID()),
+ goldmark.WithRendererOptions(html.WithUnsafe()), // allow raw HTML if needed
+ )
+ var buf bytes.Buffer
+ if err := md.Convert(data, &buf); err != nil {
return "", err
}
- return *text, nil
+ // Now extract text from the resulting HTML (using goquery or similar)
+ return extractTextFromHtmlContent(buf.Bytes())
}
func extractTextFromEpub(fpath string) (string, error) {
@@ -74,30 +95,24 @@ func extractTextFromEpub(fpath string) (string, error) {
return "", fmt.Errorf("failed to open epub: %w", err)
}
defer book.Close()
-
var sb strings.Builder
-
err = book.Each(func(title string, xhtml io.ReadCloser) {
if sb.Len() > 0 {
sb.WriteString("\n\n")
}
sb.WriteString(title)
sb.WriteString("\n")
-
buf, readErr := io.ReadAll(xhtml)
if readErr == nil {
sb.WriteString(stripHTML(string(buf)))
}
})
-
if err != nil {
return "", fmt.Errorf("failed to iterate epub chapters: %w", err)
}
-
if sb.Len() == 0 {
return "", errors.New("no content extracted from epub")
}
-
return sb.String(), nil
}
@@ -127,7 +142,6 @@ func extractTextFromPdf(fpath string) (string, error) {
return string(out), nil
}
}
-
return extractTextFromPdfPureGo(fpath)
}
@@ -137,17 +151,14 @@ func extractTextFromPdfPureGo(fpath string) (string, error) {
return "", fmt.Errorf("failed to open pdf: %w", err)
}
defer df.Close()
-
textReader, err := r.GetPlainText()
if err != nil {
return "", fmt.Errorf("failed to extract text from pdf: %w", err)
}
-
var buf bytes.Buffer
_, err = io.Copy(&buf, textReader)
if err != nil {
return "", fmt.Errorf("failed to read pdf text: %w", err)
}
-
return buf.String(), nil
}