summaryrefslogtreecommitdiff
path: root/remove_dups.py
blob: 30afed3e9624e121b525ed7566e5e78856b423b1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python

import glob
import os
import sys
from difflib import SequenceMatcher as SM

import enchant

# Input directory of OCR'd page text files (positional arg 1).
# NOTE: was `os.sys.argv` — that only works because the os module happens
# to import sys internally; use the public `sys.argv` instead.
directory = sys.argv[1]
outname = "result_out"  # default output file; overridden by arg 2

if len(sys.argv) > 2:
    outname = sys.argv[2]

# use example: ./remove_dups.py text result/v12_clean.txt

# German dictionary used to judge whether a word is "sane".
ge_dict = enchant.Dict('de_DE')
# Lines skipped as gibberish; dumped to "skipped_gibberish" at the end.
gibberish_lines = []


def clear_start(lines):
    """Trim leading junk from one page of OCR text.

    Returns `lines` starting at the first real text line; if that line is
    mostly gibberish (< 80% dictionary words) it is skipped too and
    recorded in the module-level `gibberish_lines` list.
    """
    if not lines:
        # Empty page: nothing to trim.  Previously this fell through and
        # raised IndexError on lines[0].
        return lines
    text_index = find_first_text_line_index(lines)
    ratio = gibberish_ratio(lines[text_index])
    if ratio < .8:  # arbitrary threshold
        # too much gibberish, skip this line as well
        gibberish_lines.append(lines[text_index])
        return lines[text_index+1:]
    # return lines starting with first text line
    return lines[text_index:]


def gibberish_ratio(line):
    """Return the fraction of space-separated words that the German
    dictionary recognizes (1.0 = all real words, 0.0 = all gibberish).
    """
    words = [w for w in line.split(" ") if w != ""]
    if not words:
        # Whitespace-only / empty line: previously this raised
        # ZeroDivisionError.  Treat it as pure gibberish so callers skip it.
        return 0.0
    # list of bool: [True, False, True...]
    sane = [ge_dict.check(w) for w in words]
    return sum(sane) / len(sane)


def find_first_text_line_index(lines):
    """Return the index of the first line holding at least two real words.

    Falls back to 0 when no such line exists (unexpected input).
    """
    junk = ("\n", "\r", "")
    for idx, line in enumerate(lines):
        tokens = [t for t in line.split(" ") if t not in junk]
        # a "proper" line is one with more than a single word on it
        if len(tokens) > 1:
            return idx
    # no text line found -- keep everything from the top
    return 0


# checks if current_line exists in previous page
def in_prev_lines(prev_lines, current_line):
    """Return True when `current_line` should be dropped: it is empty,
    a redundant blank line, or a fuzzy duplicate of a previous-page line.
    """
    # Empty string: leftover from a form-feed (^L) split -- always drop.
    if current_line == "":
        return True
    # No previous page to compare against.
    if not prev_lines:
        return False
    # Collapse two consecutive blank lines across the page boundary.
    if current_line == "\n" and prev_lines[-1] == "\n":
        return True
    # Fuzzy-match against every non-blank line of the previous page;
    # a similarity ratio above 0.9 counts as "close enough" to duplicate.
    return any(
        SM(None, prev, current_line).ratio() > .9
        for prev in prev_lines
        if prev != "\n"
    )


# returns lines without duplicates
def get_uniq_lines(prev_lines, curr_lines):
    """Filter `curr_lines` down to those not already present in `prev_lines`."""
    return [line for line in curr_lines if not in_prev_lines(prev_lines, line)]


def sortKeyFunc(s):
    """Numeric sort key: the page number embedded in the filename.

    Filenames look like <7-char prefix><digits><8-char suffix>, e.g.
    ffmpeg_<n>... ; the digits between position 7 and the last 8
    characters are the page index.
    """
    stem = os.path.basename(s)
    # alternative layout once used: int(stem[5:-4])
    return int(stem[7:-8])


if __name__ == "__main__":
    # Collect every page file from the input directory.  glob order is
    # filesystem-dependent; the numeric sort by filename is disabled.
    files = glob.glob(f"{directory}/*")
    # files.sort(key=sortKeyFunc)

    # Lines kept from the previously processed page (for duplicate removal).
    prev_page = []
    # page_sep = "-"*60 + "\n"

    with open(outname, "w") as wf:
        for filename in files:
            with open(filename) as rf:
                lines = rf.readlines()
                # Trim gibberish/blank leader from the top of the page,
                # then (via the double reverse) from the bottom as well.
                lines = clear_start(lines)
                lines.reverse()
                lines = clear_start(lines)
                lines.reverse()
                # lines.append(page_sep)

                # compare with prev data
                new_page = get_uniq_lines(prev_page, lines)

                # write to file
                wf.writelines(new_page)

                # cache to prev data
                prev_page = new_page

    # Dump the lines skipped as gibberish for manual inspection.
    with open("skipped_gibberish", "w") as lf:
        lf.writelines(gibberish_lines)