1 files changed, 113 insertions, 0 deletions
diff --git a/remove_dups.py b/remove_dups.py
new file mode 100755
index 0000000..cc94416
--- /dev/null
+++ b/remove_dups.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+
+import glob
+import os
+import sys
+from difflib import SequenceMatcher as SM
+
+import enchant
+
+# Usage:   ./remove_dups.py <input_dir> [output_file]
+# Example: ./remove_dups.py text result/v12_clean.txt
+if len(sys.argv) < 2:
+    # Fail with a readable message instead of a bare IndexError.
+    raise SystemExit(f"usage: {sys.argv[0]} <input_dir> [output_file]")
+
+# Directory whose files (one per page) are merged.
+directory = sys.argv[1]
+# Output file; the optional second argument overrides the default name.
+outname = sys.argv[2] if len(sys.argv) > 2 else "result_out"
+
+# Spell checker for the de_DE dictionary, used by gibberish_ratio().
+ge_dict = enchant.Dict('de_DE')
+# Lines rejected by clear_start(); written to "skipped_gibberish" at the end.
+gibberish_lines = []
+
+
+def clear_start(lines):
+    """Drop leading non-text lines, plus the first text line if it is gibberish.
+
+    The first line holding more than one word is located with
+    find_first_text_line_index(); everything before it is discarded. If less
+    than 80% of that line's words pass the spell check, the line itself is
+    appended to the module-level ``gibberish_lines`` and discarded as well.
+
+    Calling it on a reversed list trims the end of a page the same way.
+
+    Args:
+        lines: list of text lines of one page.
+
+    Returns:
+        A new list with the remaining lines; empty when ``lines`` is empty.
+    """
+    if not lines:
+        # Nothing to inspect. Happens e.g. on the second (reversed) pass
+        # when the first pass already removed the only line of a page.
+        return []
+    text_index = find_first_text_line_index(lines)
+    first_text_line = lines[text_index]
+    # gibberish_ratio() is the share of words the dictionary accepts,
+    # so a LOW value means the line is gibberish.
+    if gibberish_ratio(first_text_line) < .8:  # arbitrary threshold
+        gibberish_lines.append(first_text_line)
+        return lines[text_index + 1:]
+    # keep everything from the first text line onwards
+    return lines[text_index:]
+
+
+def gibberish_ratio(line):
+    """Return the fraction of words in ``line`` accepted by ``ge_dict``.
+
+    Despite the name, 1.0 means every word passed the spell check and 0.0
+    means none did.
+
+    Args:
+        line: one line of text, with or without its trailing newline.
+
+    Returns:
+        A float in [0.0, 1.0]; 0.0 when the line contains no words at all.
+    """
+    # split() without an argument discards every kind of whitespace, so the
+    # trailing newline is not spell-checked as part of the last word and no
+    # empty tokens are produced.
+    words = line.split()
+    if not words:
+        # Avoid dividing by zero on blank lines; no words means nothing sane.
+        return 0.0
+    return sum(ge_dict.check(w) for w in words) / len(words)
+
+
+def find_first_text_line_index(lines):
+    """Return the index of the first line that holds more than one word.
+
+    A line is split on single spaces; tokens equal to "\\n", "\\r" or the
+    empty string do not count as words. Falls back to 0 when no line
+    qualifies (including an empty ``lines``).
+    """
+    noise = ("\n", "\r", "")
+
+    def word_count(text):
+        # number of space-separated tokens that are not pure noise
+        return sum(1 for token in text.split(" ") if token not in noise)
+
+    candidates = (
+        index for index, text in enumerate(lines) if word_count(text) >= 2
+    )
+    # 0 is the fallback for the unexpected "no text line at all" case
+    return next(candidates, 0)
+
+
+# checks if current_line exists in previous page
+def in_prev_lines(prev_lines, current_line, threshold=.9):
+    """Tell whether ``current_line`` should be dropped as a duplicate.
+
+    Args:
+        prev_lines: lines of the previous page (may be empty).
+        current_line: the line to test.
+        threshold: similarity ratio (0..1) a previous line must exceed for
+            ``current_line`` to count as its duplicate.
+
+    Returns:
+        True when the line is empty or only a form feed (^L), when it is a
+        blank line and the previous page also ends with a blank line, or
+        when it is similar enough to any non-blank line of ``prev_lines``;
+        False otherwise.
+    """
+    if current_line == "" or current_line.strip("\r\n") == "\x0c":
+        return True  # delete ^L char lines (a bare "" never holds a ^L)
+    if not prev_lines:
+        return False  # no prev lines given
+    if current_line == "\n" and prev_lines[-1] == "\n":
+        return True  # delete \n if two in a row
+
+    # SequenceMatcher caches its analysis of the second sequence, so build
+    # it once for current_line and only swap the first sequence in the loop.
+    matcher = SM(None, b=current_line)
+    for prev_line in prev_lines:
+        if prev_line == "\n":
+            continue
+        matcher.set_seq1(prev_line)
+        # The two quick ratios are cheap upper bounds of ratio(); checking
+        # them first skips the expensive computation without changing results.
+        if (matcher.real_quick_ratio() > threshold
+                and matcher.quick_ratio() > threshold
+                and matcher.ratio() > threshold):
+            return True  # close enough
+    return False
+
+
+# returns lines without duplicates
+def get_uniq_lines(prev_lines, curr_lines):
+    """Return the lines of ``curr_lines`` that in_prev_lines() does not reject.
+
+    The relative order of the surviving lines is preserved.
+    """
+    return [
+        line for line in curr_lines if not in_prev_lines(prev_lines, line)
+    ]
+
+
+def sortKeyFunc(s):
+    """Return the integer embedded in the file name of path ``s`` (sort key).
+
+    The number is whatever remains of the base name after cutting off its
+    first 7 and its last 8 characters.
+    NOTE(review): presumably the names look like ``ffmpeg_<number>`` followed
+    by an 8-character tail - confirm against the real input files.
+
+    Raises:
+        ValueError: from int(), when the remainder is not an integer literal.
+    """
+    prefix_len, suffix_len = 7, 8
+    base_name = os.path.basename(s)
+    number_text = base_name[prefix_len:-suffix_len]
+    return int(number_text)
+
+
+if __name__ == "__main__":
+    # Process the pages in the numeric order encoded in their file names.
+    files = sorted(glob.glob(f"{directory}/*"), key=sortKeyFunc)
+
+    prev_page = []
+    page_sep = "-"*60 + "\n"
+
+    with open(outname, "w") as wf:
+        for filename in files:
+            with open(filename) as rf:
+                lines = rf.readlines()
+
+            # Trim the top of the page, then - by reversing - its bottom.
+            lines = clear_start(lines)
+            lines.reverse()
+            lines = clear_start(lines)
+            lines.reverse()
+
+            # Write only what the previous page did not already contain.
+            wf.writelines(get_uniq_lines(prev_page, lines))
+            # The separator is written on its own: passing it through the
+            # duplicate filter made it match the previous page's separator
+            # and vanish on every other page.
+            wf.write(page_sep)
+
+            # Remember the whole trimmed page, not just its new lines, so a
+            # line visible on three consecutive pages is not written twice.
+            prev_page = lines
+
+    # Keep the rejected lines for manual inspection.
+    with open("skipped_gibberish", "w") as lf:
+        lf.writelines(gibberish_lines)