diff options
| author | Grail Finder <wohilas@gmail.com> | 2026-02-25 06:51:02 +0300 |
|---|---|---|
| committer | Grail Finder <wohilas@gmail.com> | 2026-02-25 06:51:02 +0300 |
| commit | e0c3fe554fe5057891962234076d061ca58f694e (patch) | |
| tree | 05fe1e7588995ff4117e707fe6c546004c8a54af /rag | |
| parent | 40943ff4d34a7d77ece2c8e3a427b1cf18eccd6a (diff) | |
Feat: rag text extractors
Diffstat (limited to 'rag')
| -rw-r--r-- | rag/extractors.go | 153 | ||||
| -rw-r--r-- | rag/rag.go | 4 |
2 files changed, 154 insertions, 3 deletions
diff --git a/rag/extractors.go b/rag/extractors.go new file mode 100644 index 0000000..fcc6a2a --- /dev/null +++ b/rag/extractors.go @@ -0,0 +1,153 @@ +package rag + +import ( + "bytes" + "errors" + "fmt" + "io" + "os" + "os/exec" + "path" + "strings" + + "github.com/huantt/plaintext-extractor" + "github.com/ledongthuc/pdf" + "github.com/n3integration/epub" +) + +func ExtractText(fpath string) (string, error) { + ext := strings.ToLower(path.Ext(fpath)) + + switch ext { + case ".txt": + return extractTextFromFile(fpath) + case ".md", ".markdown": + return extractTextFromMarkdown(fpath) + case ".html", ".htm": + return extractTextFromHtml(fpath) + case ".epub": + return extractTextFromEpub(fpath) + case ".pdf": + return extractTextFromPdf(fpath) + default: + return "", fmt.Errorf("unsupported file format: %s", ext) + } +} + +func extractTextFromFile(fpath string) (string, error) { + data, err := os.ReadFile(fpath) + if err != nil { + return "", err + } + return string(data), nil +} + +func extractTextFromMarkdown(fpath string) (string, error) { + data, err := os.ReadFile(fpath) + if err != nil { + return "", err + } + extractor := plaintext.NewMarkdownExtractor() + text, err := extractor.PlainText(string(data)) + if err != nil { + return "", err + } + return *text, nil +} + +func extractTextFromHtml(fpath string) (string, error) { + data, err := os.ReadFile(fpath) + if err != nil { + return "", err + } + extractor := plaintext.NewHtmlExtractor() + text, err := extractor.PlainText(string(data)) + if err != nil { + return "", err + } + return *text, nil +} + +func extractTextFromEpub(fpath string) (string, error) { + book, err := epub.Open(fpath) + if err != nil { + return "", fmt.Errorf("failed to open epub: %w", err) + } + defer book.Close() + + var sb strings.Builder + + err = book.Each(func(title string, xhtml io.ReadCloser) { + if sb.Len() > 0 { + sb.WriteString("\n\n") + } + sb.WriteString(title) + sb.WriteString("\n") + + buf, readErr := io.ReadAll(xhtml) + if readErr == nil { + sb.WriteString(stripHTML(string(buf))) + } + }) + + if err != nil { + return "", fmt.Errorf("failed to iterate epub chapters: %w", err) + } + + if sb.Len() == 0 { + return "", errors.New("no content extracted from epub") + } + + return sb.String(), nil +} + +func stripHTML(html string) string { + var sb strings.Builder + inTag := false + for _, r := range html { + switch r { + case '<': + inTag = true + case '>': + inTag = false + default: + if !inTag { + sb.WriteRune(r) + } + } + } + return sb.String() +} + +func extractTextFromPdf(fpath string) (string, error) { + _, err := exec.LookPath("pdftotext") + if err == nil { + out, err := exec.Command("pdftotext", "-layout", fpath, "-").Output() + if err == nil && len(out) > 0 { + return string(out), nil + } + } + + return extractTextFromPdfPureGo(fpath) +} + +func extractTextFromPdfPureGo(fpath string) (string, error) { + df, r, err := pdf.Open(fpath) + if err != nil { + return "", fmt.Errorf("failed to open pdf: %w", err) + } + defer df.Close() + + textReader, err := r.GetPlainText() + if err != nil { + return "", fmt.Errorf("failed to extract text from pdf: %w", err) + } + + var buf bytes.Buffer + _, err = io.Copy(&buf, textReader) + if err != nil { + return "", fmt.Errorf("failed to read pdf text: %w", err) + } + + return buf.String(), nil +} @@ -7,7 +7,6 @@ import ( "gf-lt/models" "gf-lt/storage" "log/slog" - "os" "path" "regexp" "sort" @@ -58,7 +57,7 @@ func wordCounter(sentence string) int { func (r *RAG) LoadRAG(fpath string) error { r.mu.Lock() defer r.mu.Unlock() - data, err := os.ReadFile(fpath) + fileText, err := ExtractText(fpath) if err != nil { return err } @@ -68,7 +67,6 @@ func (r *RAG) LoadRAG(fpath string) error { default: r.logger.Warn("LongJobStatusCh channel is full or closed, dropping status message", "message", LoadedFileRAGStatus) } - fileText := string(data) tokenizer, err := english.NewSentenceTokenizer(nil) if err != nil { return err |
