summary | refs | log | tree | commit | diff
path: root/rag
diff options
context:
space:
mode:
author: Grail Finder <wohilas@gmail.com> 2026-02-25 06:51:02 +0300
committer: Grail Finder <wohilas@gmail.com> 2026-02-25 06:51:02 +0300
commit: e0c3fe554fe5057891962234076d061ca58f694e (patch)
tree: 05fe1e7588995ff4117e707fe6c546004c8a54af /rag
parent: 40943ff4d34a7d77ece2c8e3a427b1cf18eccd6a (diff)
Feat: rag text extractors
Diffstat (limited to 'rag')
-rw-r--r-- rag/extractors.go 153
-rw-r--r-- rag/rag.go 4
2 files changed, 154 insertions, 3 deletions
diff --git a/rag/extractors.go b/rag/extractors.go
new file mode 100644
index 0000000..fcc6a2a
--- /dev/null
+++ b/rag/extractors.go
@@ -0,0 +1,153 @@
+package rag
+
+import (
+ "bytes"
+ "errors"
+ "fmt"
+ "io"
+ "os"
+ "os/exec"
+ "path"
+ "strings"
+
+ "github.com/huantt/plaintext-extractor"
+ "github.com/ledongthuc/pdf"
+ "github.com/n3integration/epub"
+)
+
+func ExtractText(fpath string) (string, error) {
+ ext := strings.ToLower(path.Ext(fpath))
+
+ switch ext {
+ case ".txt":
+ return extractTextFromFile(fpath)
+ case ".md", ".markdown":
+ return extractTextFromMarkdown(fpath)
+ case ".html", ".htm":
+ return extractTextFromHtml(fpath)
+ case ".epub":
+ return extractTextFromEpub(fpath)
+ case ".pdf":
+ return extractTextFromPdf(fpath)
+ default:
+ return "", fmt.Errorf("unsupported file format: %s", ext)
+ }
+}
+
// extractTextFromFile reads the whole file at fpath and returns its bytes
// converted to a string, without any further processing. The error from
// os.ReadFile is returned unchanged.
func extractTextFromFile(fpath string) (string, error) {
	contents, err := os.ReadFile(fpath)
	if err != nil {
		return "", err
	}
	return string(contents), nil
}
+
+func extractTextFromMarkdown(fpath string) (string, error) {
+ data, err := os.ReadFile(fpath)
+ if err != nil {
+ return "", err
+ }
+ extractor := plaintext.NewMarkdownExtractor()
+ text, err := extractor.PlainText(string(data))
+ if err != nil {
+ return "", err
+ }
+ return *text, nil
+}
+
+func extractTextFromHtml(fpath string) (string, error) {
+ data, err := os.ReadFile(fpath)
+ if err != nil {
+ return "", err
+ }
+ extractor := plaintext.NewHtmlExtractor()
+ text, err := extractor.PlainText(string(data))
+ if err != nil {
+ return "", err
+ }
+ return *text, nil
+}
+
+func extractTextFromEpub(fpath string) (string, error) {
+ book, err := epub.Open(fpath)
+ if err != nil {
+ return "", fmt.Errorf("failed to open epub: %w", err)
+ }
+ defer book.Close()
+
+ var sb strings.Builder
+
+ err = book.Each(func(title string, xhtml io.ReadCloser) {
+ if sb.Len() > 0 {
+ sb.WriteString("\n\n")
+ }
+ sb.WriteString(title)
+ sb.WriteString("\n")
+
+ buf, readErr := io.ReadAll(xhtml)
+ if readErr == nil {
+ sb.WriteString(stripHTML(string(buf)))
+ }
+ })
+
+ if err != nil {
+ return "", fmt.Errorf("failed to iterate epub chapters: %w", err)
+ }
+
+ if sb.Len() == 0 {
+ return "", errors.New("no content extracted from epub")
+ }
+
+ return sb.String(), nil
+}
+
// stripHTML removes markup from html by dropping everything between a '<'
// and the next '>' (inclusive) and returning the remaining runes.
//
// A '>' that appears outside of a tag is ordinary text and is kept; only a
// '>' that closes an open tag is consumed. Character entities such as
// "&amp;" are not decoded, and the contents of elements like <script> or
// <style> are not filtered out.
func stripHTML(html string) string {
	var sb strings.Builder
	sb.Grow(len(html)) // output is never longer than the input for valid UTF-8

	inTag := false
	for _, r := range html {
		switch {
		case r == '<':
			inTag = true
		case r == '>' && inTag:
			inTag = false
		case !inTag:
			sb.WriteRune(r)
		}
	}
	return sb.String()
}
+
+func extractTextFromPdf(fpath string) (string, error) {
+ _, err := exec.LookPath("pdftotext")
+ if err == nil {
+ out, err := exec.Command("pdftotext", "-layout", fpath, "-").Output()
+ if err == nil && len(out) > 0 {
+ return string(out), nil
+ }
+ }
+
+ return extractTextFromPdfPureGo(fpath)
+}
+
+func extractTextFromPdfPureGo(fpath string) (string, error) {
+ df, r, err := pdf.Open(fpath)
+ if err != nil {
+ return "", fmt.Errorf("failed to open pdf: %w", err)
+ }
+ defer df.Close()
+
+ textReader, err := r.GetPlainText()
+ if err != nil {
+ return "", fmt.Errorf("failed to extract text from pdf: %w", err)
+ }
+
+ var buf bytes.Buffer
+ _, err = io.Copy(&buf, textReader)
+ if err != nil {
+ return "", fmt.Errorf("failed to read pdf text: %w", err)
+ }
+
+ return buf.String(), nil
+}
diff --git a/rag/rag.go b/rag/rag.go
index b49bd97..b8b5447 100644
--- a/rag/rag.go
+++ b/rag/rag.go
@@ -7,7 +7,6 @@ import (
"gf-lt/models"
"gf-lt/storage"
"log/slog"
- "os"
"path"
"regexp"
"sort"
@@ -58,7 +57,7 @@ func wordCounter(sentence string) int {
func (r *RAG) LoadRAG(fpath string) error {
r.mu.Lock()
defer r.mu.Unlock()
- data, err := os.ReadFile(fpath)
+ fileText, err := ExtractText(fpath)
if err != nil {
return err
}
@@ -68,7 +67,6 @@ func (r *RAG) LoadRAG(fpath string) error {
default:
r.logger.Warn("LongJobStatusCh channel is full or closed, dropping status message", "message", LoadedFileRAGStatus)
}
- fileText := string(data)
tokenizer, err := english.NewSentenceTokenizer(nil)
if err != nil {
return err