summaryrefslogtreecommitdiff
path: root/remove_dups.py
diff options
context:
space:
mode:
Diffstat (limited to 'remove_dups.py')
-rwxr-xr-xremove_dups.py113
1 files changed, 113 insertions, 0 deletions
diff --git a/remove_dups.py b/remove_dups.py
new file mode 100755
index 0000000..cc94416
--- /dev/null
+++ b/remove_dups.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+
import glob
import os
import re
import sys
from difflib import SequenceMatcher as SM

import enchant
+
# Usage example: ./remove_dups.py text result/v12_clean.txt
#   argv[1]  directory containing the per-page text files (required)
#   argv[2]  name of the output file (optional, defaults to "result_out")
if len(sys.argv) < 2:
    # Fail with a readable hint instead of a bare IndexError.
    raise SystemExit(f"usage: {sys.argv[0]} <directory> [<output file>]")

directory = sys.argv[1]
outname = sys.argv[2] if len(sys.argv) > 2 else "result_out"

# German dictionary used by gibberish_ratio() to recognise real words.
ge_dict = enchant.Dict('de_DE')
# Lines rejected by clear_start(); the script dumps them to the file
# "skipped_gibberish" once all pages have been processed.
gibberish_lines = []
+
+
def clear_start(lines, min_ratio=.8):
    """Trim the beginning of a page down to its first sane text line.

    Everything before the first text line (as located by
    find_first_text_line_index) is dropped.  If that first text line scores
    below *min_ratio* in gibberish_ratio, it is dropped as well and recorded
    in the module-level ``gibberish_lines`` list.

    Args:
        lines: list of lines of one page.
        min_ratio: minimum share of dictionary words the first text line
            needs in order to be kept (the default 0.8 is an arbitrary
            cut-off).

    Returns:
        A new list holding the remaining lines; empty for an empty page.
    """
    if not lines:
        # An empty page has no first line to inspect.
        return []

    text_index = find_first_text_line_index(lines)
    first_text_line = lines[text_index]
    if gibberish_ratio(first_text_line) < min_ratio:
        # Too much gibberish: remember the line and skip it.
        gibberish_lines.append(first_text_line)
        return lines[text_index + 1:]
    # Return the lines starting with the first text line.
    return lines[text_index:]
+
+
def gibberish_ratio(line, dictionary=None):
    """Return the share of words in *line* that the dictionary accepts.

    The line is split on any whitespace, so the trailing newline is not
    glued to the last word and runs of spaces do not yield empty words.

    Args:
        line: the text line to rate.
        dictionary: object with a ``check(word)`` method returning a truthy
            value for known words; defaults to the module-level ``ge_dict``.

    Returns:
        A float between 0.0 (no word recognised) and 1.0 (every word
        recognised).  A line without any words rates 0.0.
    """
    words = line.split()
    if not words:
        # Nothing to check; avoids dividing by zero below.
        return 0.0
    if dictionary is None:
        dictionary = ge_dict
    return sum(bool(dictionary.check(w)) for w in words) / len(words)
+
+
def find_first_text_line_index(lines):
    """Return the index of the first line that contains more than one word.

    Words are the whitespace-separated tokens of a line, so blank lines and
    lines made up only of spaces never count as text.

    Args:
        lines: list of lines of one page.

    Returns:
        The index of the first line with at least two words, or 0 when no
        such line exists (including an empty list).
    """
    for index, line in enumerate(lines):
        if len(line.split()) > 1:
            # Found a line with text.
            return index
    # Unexpected case: no text line at all.
    return 0
+
+
def in_prev_lines(prev_lines, current_line, min_ratio=.9):
    """Tell whether *current_line* should be dropped as a duplicate.

    Args:
        prev_lines: lines of the previous page.
        current_line: the line being examined.
        min_ratio: similarity (SequenceMatcher ratio) above which two lines
            are considered the same.

    Returns:
        True when the line is to be removed:
          * it consists solely of a form feed (^L) or a single space,
          * it is a bare newline and the previous page also ended in one,
          * or it is close enough to any non-blank line of the previous page.
        False otherwise, in particular when no previous lines are given.
    """
    if current_line in (" ", "\f"):
        return True  # delete ^L char lines
    if not prev_lines:
        return False  # no prev lines given
    if current_line == "\n" and prev_lines[-1] == "\n":
        return True  # delete \n if two in a row

    # SequenceMatcher caches its analysis of the second sequence, so build
    # the matcher once for current_line and only swap the first sequence.
    matcher = SM(None, b=current_line)
    for prev in prev_lines:
        if prev == "\n":
            continue
        matcher.set_seq1(prev)
        if matcher.ratio() > min_ratio:  # close enough
            return True
    return False
+
+
def get_uniq_lines(prev_lines, curr_lines):
    """Return the lines of *curr_lines* that in_prev_lines does not flag.

    The relative order of the surviving lines is preserved.
    """
    return [
        line for line in curr_lines
        if not in_prev_lines(prev_lines, line)
    ]
+
+
def sortKeyFunc(s):
    """Return the number embedded in file name *s*, for numeric sorting.

    The fixed layout is tried first: a 7-character prefix such as
    "ffmpeg_", the number, then eight trailing characters.  If that slice is
    not a number, the last run of digits found before the first "." of the
    base name is used instead.

    Args:
        s: path of a page file.

    Returns:
        The number as an int.

    Raises:
        ValueError: if no number can be found in the file name.
    """
    name = os.path.basename(s)
    try:
        return int(name[7:-8])
    except ValueError:
        pass  # name does not follow the fixed layout; look for digits

    digit_runs = re.findall(r"\d+", name.partition(".")[0])
    if not digit_runs:
        raise ValueError(f"no page number in file name: {s!r}")
    return int(digit_runs[-1])
+
+
def _trim_page(lines):
    """Run clear_start() on the top and on the bottom of a page."""
    if lines:
        lines = clear_start(lines)
    # Reverse so the same routine can clean the end of the page.
    lines.reverse()
    if lines:
        lines = clear_start(lines)
    lines.reverse()
    return lines


def main():
    """Merge all page files of ``directory`` into ``outname`` without repeats.

    The files are processed in the numeric order given by sortKeyFunc.  Each
    page is trimmed at both ends, stripped of the lines that already occur
    on the previous page, and written out followed by a separator line.
    Lines rejected as gibberish are finally written to "skipped_gibberish".
    """
    files = glob.glob(f"{directory}/*")
    files.sort(key=sortKeyFunc)

    prev_page = []
    page_sep = "-" * 60 + "\n"

    with open(outname, "w") as wf:
        for filename in files:
            with open(filename) as rf:
                lines = _trim_page(rf.readlines())

            # Compare with the previous page.
            new_page = get_uniq_lines(prev_page, lines)

            # Write the page, then the separator.  The separator is kept
            # out of the comparison; otherwise it would match the previous
            # page's separator and vanish on every other page.
            wf.writelines(new_page)
            wf.write(page_sep)

            # Cache the complete page, not just its surviving lines, so a
            # line repeated on several consecutive pages stays removed.
            prev_page = lines

    with open("skipped_gibberish", "w") as lf:
        lf.writelines(gibberish_lines)


if __name__ == "__main__":
    main()