diff options
Diffstat (limited to 'remove_dups.py')
-rwxr-xr-x | remove_dups.py | 113 |
1 files changed, 113 insertions, 0 deletions
#!/usr/bin/env python
"""Merge per-page text files into one file, dropping repeated lines.

Every file in the input directory is treated as one page.  For each page the
leading and trailing lines are trimmed (see ``clear_start``), lines that are
nearly identical to a line kept from the previous page are removed, and what
is left is appended to the output file followed by a separator line.  Lines
rejected as gibberish are collected in the file ``skipped_gibberish``.

Usage example: ./remove_dups.py text result/v12_clean.txt
"""

import functools
import glob
import os
import sys
from difflib import SequenceMatcher as SM

DEFAULT_OUTNAME = "result_out"
GIBBERISH_LOG = "skipped_gibberish"
PAGE_SEP = "-" * 60 + "\n"

# A boundary line is rejected when fewer than this share of its words pass
# the spell check.  The value is arbitrary.
MIN_SANE_RATIO = .8
# Two lines count as duplicates when their similarity exceeds this value.
MIN_DUP_RATIO = .9

# Boundary lines rejected by clear_start(), in the order they were found.
gibberish_lines = []


@functools.lru_cache(maxsize=None)
def _german_dict():
    """Return the shared ``de_DE`` spell-check dictionary, built on first use.

    The third-party ``enchant`` package is imported here rather than at module
    level so that importing this module neither requires the package nor
    loads a dictionary until a line actually has to be checked.
    """
    import enchant

    return enchant.Dict('de_DE')


def clear_start(lines):
    """Return ``lines`` without the junk in front of the first text line.

    Everything before the first line holding more than one word is dropped.
    If that line itself scores below ``MIN_SANE_RATIO`` it is recorded in
    ``gibberish_lines`` and dropped as well.  When no line qualifies as text,
    the first line is the one that gets scored.

    An empty list is returned unchanged; this happens when a previous trim
    already consumed the whole page.
    """
    if not lines:
        return []
    text_index = find_first_text_line_index(lines)
    candidate = lines[text_index]
    if gibberish_ratio(candidate) < MIN_SANE_RATIO:
        # too much gibberish, skip the line but remember it
        gibberish_lines.append(candidate)
        return lines[text_index + 1:]
    # keep everything starting with the first text line
    return lines[text_index:]


def gibberish_ratio(line):
    """Return the share of words in ``line`` accepted by the dictionary.

    Despite the name, a *high* value means the line looks sane: 1.0 when every
    word passes the spell check, 0.0 when none does.  A line without any words
    scores 0.0.
    """
    # split() with no argument discards all surrounding whitespace, so the
    # line terminator is not glued to the last word before it is checked.
    words = line.split()
    if not words:
        return 0.0
    checker = _german_dict()
    return sum(checker.check(word) for word in words) / len(words)


def find_first_text_line_index(lines):
    """Return the index of the first line that holds more than one word.

    Words are the space-separated pieces of a line, ignoring empty pieces and
    pieces that are only a line terminator.  Returns 0 when no line qualifies.
    """
    blanks = ("\n", "\r", "")
    for index, line in enumerate(lines):
        words = [piece for piece in line.split(" ") if piece not in blanks]
        if len(words) > 1:
            # found line with text
            return index
    # unexpected case: nothing looks like text
    return 0


def in_prev_lines(prev_lines, current_line):
    """Tell whether ``current_line`` should be dropped as a repeat.

    Returns True when the line is empty or a lone form feed, when it is a
    blank line and the previous page also ended with a blank line, or when it
    is more than ``MIN_DUP_RATIO`` similar to any non-blank line in
    ``prev_lines``.  Returns False otherwise, including when ``prev_lines``
    is empty.
    """
    # The original comparison was against "" only, which a line returned by
    # readlines() can never equal; its comment asked for ^L (form feed) lines
    # to be deleted, so the form feed is matched explicitly.
    if current_line in ("", "\f"):
        return True
    if not prev_lines:
        return False  # nothing to compare against
    if current_line == "\n" and prev_lines[-1] == "\n":
        return True  # delete \n if two in a row

    # SequenceMatcher caches its analysis of the second sequence, so one
    # matcher is reused and only the first sequence changes per comparison.
    matcher = SM(None)
    matcher.set_seq2(current_line)
    for prev in prev_lines:
        if prev == "\n":
            continue
        matcher.set_seq1(prev)
        # The two quick ratios are cheap upper bounds on ratio(); checking
        # them first skips the expensive comparison without changing results.
        if (matcher.real_quick_ratio() > MIN_DUP_RATIO
                and matcher.quick_ratio() > MIN_DUP_RATIO
                and matcher.ratio() > MIN_DUP_RATIO):
            return True  # close enough
    return False


def get_uniq_lines(prev_lines, curr_lines):
    """Return the lines of ``curr_lines`` that ``in_prev_lines`` keeps."""
    return [line for line in curr_lines if not in_prev_lines(prev_lines, line)]


def sortKeyFunc(s):
    """Return the page number embedded in the file name of path ``s``.

    The number is whatever remains of the base name after removing its first
    7 characters (e.g. ``ffmpeg_``) and its last 8 characters.  An earlier
    naming scheme, ``gray_<n>.txt``, used the slice ``[5:-4]`` instead.

    Raises ValueError when the remaining text is not an integer.
    """
    return int(os.path.basename(s)[7:-8])


def main(argv=None):
    """Run the merge; return a process exit status.

    ``argv`` holds the command-line arguments without the program name
    (defaults to ``sys.argv[1:]``): the input directory, optionally followed
    by the output file name.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        prog = os.path.basename(sys.argv[0]) if sys.argv else "remove_dups.py"
        print(f"usage: {prog} DIRECTORY [OUTFILE]", file=sys.stderr)
        return 2
    directory = args[0]
    outname = args[1] if len(args) > 1 else DEFAULT_OUTNAME

    files = glob.glob(f"{directory}/*")
    files.sort(key=sortKeyFunc)

    prev_page = []
    with open(outname, "w") as wf:
        for filename in files:
            with open(filename) as rf:
                lines = rf.readlines()

            # trim the head, then flip the page to trim the tail the same way
            lines = clear_start(lines)
            lines.reverse()
            lines = clear_start(lines)
            lines.reverse()

            # compare with prev data
            new_page = get_uniq_lines(prev_page, lines)

            # The separator is written on its own and kept out of the cached
            # page; otherwise it matches itself on the following page and is
            # removed as a duplicate there.
            wf.writelines(new_page)
            wf.write(PAGE_SEP)

            # cache to prev data
            prev_page = new_page

    with open(GIBBERISH_LOG, "w") as lf:
        lf.writelines(gibberish_lines)
    return 0


if __name__ == "__main__":
    sys.exit(main())