#!/usr/bin/env python
"""Merge a directory of per-page text files into one file without repeats.

Every page has its leading and trailing non-text / gibberish lines trimmed,
then each line that (nearly) matches a line of the previous page is dropped.
Lines rejected as gibberish are logged to ``skipped_gibberish``.

Usage example:
    ./remove_dups.py text result/v12_clean.txt
"""
import glob
import os
import re
import sys
from difflib import SequenceMatcher as SM
from typing import Iterable, Iterator, List, Optional, Sequence

DEFAULT_OUTNAME = "result_out"
GIBBERISH_LOG = "skipped_gibberish"

# German spell-check dictionary; created on first use by _german_dict() so
# that importing this module neither needs pyenchant nor loads a dictionary.
ge_dict = None

# Lines dropped by clear_start() because too few of their words were known.
# Mutated in place (never rebound) so every holder of the list sees updates.
gibberish_lines = []


def _german_dict():
    """Return the shared ``de_DE`` enchant dictionary, creating it once."""
    global ge_dict
    if ge_dict is None:
        # Third-party import deferred on purpose: only spell checking needs it.
        import enchant

        ge_dict = enchant.Dict('de_DE')
    return ge_dict


def clear_start(lines: Sequence[str], dictionary=None,
                min_ratio: float = .8) -> List[str]:
    """Return ``lines`` starting at the first usable text line.

    Lines before the first text line (see ``find_first_text_line_index``) are
    dropped. If that first text line itself looks like gibberish it is
    recorded in ``gibberish_lines`` and dropped as well; only that single
    line is checked, the following ones are kept unchecked.

    Args:
        lines: Lines of one page, in the order they should be scanned.
        dictionary: Object with a ``check(word) -> bool`` method; defaults to
            the German enchant dictionary.
        min_ratio: Minimum share of known words for the first text line to
            be kept (the default is an arbitrary, empirically chosen value).

    Returns:
        A new list; empty when ``lines`` is empty.
    """
    if not lines:
        # An empty page (e.g. an empty file) has nothing to trim.
        return []
    text_index = find_first_text_line_index(lines)
    candidate = lines[text_index]
    if gibberish_ratio(candidate, dictionary) < min_ratio:
        # Too much gibberish: log the line and skip it.
        gibberish_lines.append(candidate)
        return list(lines[text_index + 1:])
    return list(lines[text_index:])


def gibberish_ratio(line: str, dictionary=None) -> float:
    """Return the share of words in ``line`` accepted by the spell checker.

    Words are the space-separated tokens of ``line`` with surrounding
    whitespace stripped, so a trailing newline is not handed to the checker.

    Args:
        line: The text line to rate.
        dictionary: Object with a ``check(word) -> bool`` method; defaults to
            the German enchant dictionary.

    Returns:
        A value in ``[0, 1]``; ``0.0`` when the line contains no words.
    """
    words = [w for w in (token.strip() for token in line.split(" ")) if w]
    if not words:
        # No words at all: rate as pure gibberish instead of dividing by zero.
        return 0.0
    checker = _german_dict() if dictionary is None else dictionary
    return sum(bool(checker.check(w)) for w in words) / len(words)


def find_first_text_line_index(lines: Sequence[str]) -> int:
    """Return the index of the first line holding more than one word.

    Empty and whitespace-only tokens (from repeated spaces or a bare line
    ending) do not count as words. Returns ``0`` when no such line exists.
    """
    for i, line in enumerate(lines):
        words = [w for w in line.split(" ") if w.strip()]
        if len(words) > 1:
            return i
    # Unexpected case: no line with text found.
    return 0


def in_prev_lines(prev_lines: Sequence[str], current_line: str,
                  threshold: float = .9) -> bool:
    """Tell whether ``current_line`` should be dropped as already seen.

    Returns True when the line is a bare page-break marker, when it is a
    blank line and ``prev_lines`` already ends with a blank line, or when it
    is similar enough to any non-blank line of ``prev_lines``.

    Args:
        prev_lines: Lines of the previous page.
        current_line: Candidate line from the current page.
        threshold: Similarity ratio that must be exceeded to count as a
            duplicate.
    """
    # The original check compared against a lone space although its comment
    # spoke of ^L (form feed) lines; both markers are accepted here.
    if current_line in (" ", "\x0c"):
        return True
    if not prev_lines:
        return False
    if current_line == "\n" and prev_lines[-1] == "\n":
        # Avoid two blank lines in a row.
        return True

    # SequenceMatcher caches details of its second sequence, so fix the
    # current line there and only swap the first sequence inside the loop.
    matcher = SM(None)
    matcher.set_seq2(current_line)
    for prev in prev_lines:
        if prev == "\n":
            continue
        matcher.set_seq1(prev)
        # The two quick ratios are cheap upper bounds of ratio(); testing
        # them first skips the expensive comparison for obvious mismatches.
        if (matcher.real_quick_ratio() > threshold
                and matcher.quick_ratio() > threshold
                and matcher.ratio() > threshold):
            return True
    return False


def get_uniq_lines(prev_lines: Sequence[str],
                   curr_lines: Iterable[str]) -> List[str]:
    """Return the lines of ``curr_lines`` that ``in_prev_lines`` keeps."""
    return [cl for cl in curr_lines if not in_prev_lines(prev_lines, cl)]


def sortKeyFunc(s: str) -> int:
    """Return the integer embedded in the file name of path ``s``.

    The number is expected between a 7-character prefix and an 8-character
    suffix of the base name. Raises ValueError for names of another shape.
    Not used by ``main``, which orders files with ``natural_key`` instead.
    """
    return int(os.path.basename(s)[7:-8])


def natural_key(path: str) -> list:
    """Return a sort key ordering file names with numbers numerically.

    The base name is split into digit and non-digit runs so that, e.g.,
    ``f2.txt`` sorts before ``f10.txt``.
    """
    parts = re.split(r"(\d+)", os.path.basename(path))
    # re.split with a capturing group puts the digit runs at odd indices.
    return [(0, int(p), "") if i % 2 else (1, 0, p)
            for i, p in enumerate(parts) if p]


def trim_page(lines: Sequence[str], dictionary=None) -> List[str]:
    """Trim non-text / gibberish lines from both ends of one page."""
    head_trimmed = clear_start(lines, dictionary)
    # Run the same trimming from the bottom by scanning the reversed page.
    tail_trimmed = clear_start(head_trimmed[::-1], dictionary)
    return tail_trimmed[::-1]


def dedupe_pages(pages: Iterable[Sequence[str]],
                 dictionary=None) -> Iterator[List[str]]:
    """Yield, per page, the trimmed lines not present on the previous page.

    Each page is compared against the *whole* previous trimmed page, not only
    against the lines that were new on it; otherwise a page repeated three
    times would be written again the third time. Pages that are empty after
    trimming do not replace the page used for comparison.
    """
    prev_page = []  # type: List[str]
    for page in pages:
        lines = trim_page(page, dictionary)
        yield get_uniq_lines(prev_page, lines)
        if lines:
            prev_page = lines


def _read_pages(paths: Iterable[str]) -> Iterator[List[str]]:
    """Yield the lines of each file in ``paths``, one file at a time."""
    for path in paths:
        with open(path) as rf:
            yield rf.readlines()


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Run the script: ``<directory> [output_file]``.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        Process exit status (``0`` on success, ``2`` on usage error).
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        prog = os.path.basename(sys.argv[0]) if sys.argv else "remove_dups.py"
        print(f"usage: {prog} <directory> [output_file]", file=sys.stderr)
        return 2
    directory = args[0]
    outname = args[1] if len(args) > 1 else DEFAULT_OUTNAME

    # Never feed this script's own output files back in as pages.
    own_outputs = {os.path.abspath(outname), os.path.abspath(GIBBERISH_LOG)}
    # glob returns names in arbitrary order; sort so pages are processed in
    # a deterministic, numerically sensible sequence.
    files = sorted(
        (p for p in glob.glob(f"{directory}/*")
         if os.path.isfile(p) and os.path.abspath(p) not in own_outputs),
        key=natural_key,
    )

    with open(outname, "w") as wf:
        for new_page in dedupe_pages(_read_pages(files)):
            wf.writelines(new_page)

    with open(GIBBERISH_LOG, "w") as lf:
        lf.writelines(gibberish_lines)
    return 0


if __name__ == "__main__":
    sys.exit(main())