1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
|
#!/usr/bin/env python
import glob
import os
import sys
from difflib import SequenceMatcher as SM

import enchant
# Command-line interface:
#     ./remove_dups.py <directory> [<outname>]
# use example: ./remove_dups.py text result/v12_clean.txt
if len(sys.argv) < 2:
    # Exit with a usage message instead of a bare IndexError traceback.
    raise SystemExit(f"usage: {sys.argv[0]} <directory> [<outname>]")

# Directory whose files are read as the input pages.
directory = sys.argv[1]
# Output path; the optional second argument overrides the default name.
outname = sys.argv[2] if len(sys.argv) > 2 else "result_out"

# Spell-check dictionary (de_DE) queried by gibberish_ratio().
ge_dict = enchant.Dict('de_DE')
# Lines rejected by clear_start(); written to "skipped_gibberish" at the end.
gibberish_lines = []
def clear_start(lines, min_ratio=.8):
    """Trim the leading non-text lines of a page.

    The first text line is the one chosen by find_first_text_line_index().
    If gibberish_ratio() of that line is below *min_ratio*, the line is
    appended to the module-level ``gibberish_lines`` list and dropped too.

    Args:
        lines: list of lines of one page.
        min_ratio: minimum share of dictionary words the first text line
            must reach to be kept. The default .8 is an arbitrary cut-off.

    Returns:
        A new list starting at the first text line, or at the line after it
        when that line was rejected. An empty input yields an empty list.
    """
    if not lines:
        # Nothing to trim; indexing below would raise IndexError.
        return lines[:]

    text_index = find_first_text_line_index(lines)
    first_text_line = lines[text_index]
    if gibberish_ratio(first_text_line) < min_ratio:
        # too much gibberish, skip the line but remember it
        gibberish_lines.append(first_text_line)
        return lines[text_index + 1:]
    # return lines starting with first text line
    return lines[text_index:]
def gibberish_ratio(line):
    """Return the fraction of words in *line* accepted by ``ge_dict``.

    Despite the name, a higher value means a saner line: 1.0 when every
    word passes ``ge_dict.check``, 0.0 when none does.

    Words are split on any whitespace, so a trailing newline is not left
    attached to the last word before it is spell-checked.

    Args:
        line: one line of text.

    Returns:
        A float between 0.0 and 1.0. A line without any words returns 0.0
        instead of raising ZeroDivisionError.
    """
    words = line.split()
    if not words:
        return 0.0
    # check() yields booleans, so the sum is the number of accepted words.
    return sum(ge_dict.check(word) for word in words) / len(words)
def find_first_text_line_index(lines):
    """Return the index of the first line that contains more than one word.

    Words are separated by any whitespace, so line endings such as "\\r\\n"
    and runs of blanks never count as words.

    Args:
        lines: list of lines of one page.

    Returns:
        The index of the first line with at least two words, or 0 when no
        line qualifies (including an empty list).
    """
    return next(
        (index for index, line in enumerate(lines) if len(line.split()) > 1),
        0,  # unexpected case: no proper text line found
    )
def in_prev_lines(prev_lines, current_line):
    """Tell whether *current_line* should be dropped as a duplicate.

    A line is dropped when it is empty or a lone form feed (^L), when it is
    a blank line and the previous page also ended with a blank line, or
    when it is more than 90% similar to any non-blank line of the previous
    page according to difflib.SequenceMatcher.

    Args:
        prev_lines: lines of the previous page.
        current_line: the line of the current page being examined.

    Returns:
        True if the line should be removed, False if it should be kept.
    """
    if current_line in ("", "\f"):
        return True  # delete ^L char lines
    if not prev_lines:
        return False  # no prev lines given
    if current_line == "\n" and prev_lines[-1] == "\n":
        return True  # delete \n if two in a row

    # SequenceMatcher caches its analysis of the second sequence, so set the
    # fixed line once and only swap the first sequence inside the loop.
    matcher = SM(None)
    matcher.set_seq2(current_line)
    for prev_line in prev_lines:
        if prev_line == "\n":
            continue
        matcher.set_seq1(prev_line)
        # The two quick ratios are cheap upper bounds on ratio(); they can
        # only rule a pair out early, never change the final answer.
        if (matcher.real_quick_ratio() > .9
                and matcher.quick_ratio() > .9
                and matcher.ratio() > .9):  # close enough
            return True
    return False
def get_uniq_lines(prev_lines, curr_lines):
    """Return the lines of *curr_lines* that are not duplicates.

    Each line is tested with in_prev_lines() against *prev_lines*; lines it
    flags are left out. The order of the remaining lines is preserved.

    Args:
        prev_lines: lines of the previous page.
        curr_lines: lines of the current page.

    Returns:
        A new list with the lines to keep.
    """
    return [line for line in curr_lines if not in_prev_lines(prev_lines, line)]
def sortKeyFunc(s, prefix_len=7, suffix_len=8):
    """Return the integer embedded in a file name, for numeric sorting.

    The number is what remains of the base name after cutting *prefix_len*
    leading and *suffix_len* trailing characters. The defaults skip a
    7-character prefix such as "ffmpeg_" and an 8-character tail. Names
    shaped like "gray_123.txt" need prefix_len=5 and suffix_len=4.

    Args:
        s: path of the file; only its base name is inspected.
        prefix_len: number of characters before the number.
        suffix_len: number of characters after the number.

    Returns:
        The number as an int.

    Raises:
        ValueError: if the remaining text is not a valid integer.
    """
    name = os.path.basename(s)
    # An explicit end index keeps suffix_len=0 working (name[a:-0] is empty).
    return int(name[prefix_len:len(name) - suffix_len])
def main():
    """Merge the pages in ``directory`` into ``outname`` without repeats.

    Every file is one page. Leading and trailing non-text or gibberish
    lines are trimmed with clear_start(), lines that also occur on the
    previous page are removed with get_uniq_lines(), and what remains is
    appended to the output file. Lines rejected as gibberish are written
    to "skipped_gibberish" in the current working directory.
    """
    # Keep regular files only; open() would fail on sub-directories.
    files = [path for path in glob.glob(f"{directory}/*") if os.path.isfile(path)]
    # glob returns paths in arbitrary order, but each page is compared with
    # the one before it, so the order must be stable. Prefer the numeric
    # key and fall back to plain name order when a name does not fit it.
    try:
        files.sort(key=sortKeyFunc)
    except ValueError:
        files.sort()

    prev_page = []
    with open(outname, "w") as wf:
        for filename in files:
            with open(filename) as rf:
                lines = rf.readlines()

            # Trim the top of the page, then the bottom by running the same
            # routine on the reversed page. clear_start() cannot handle an
            # empty list, so stop as soon as nothing is left.
            if lines:
                lines = clear_start(lines)
            if lines:
                lines = clear_start(lines[::-1])[::-1]
            if not lines:
                # Empty page: nothing to write; keep the comparison page.
                continue

            # compare with prev data and write what is new
            wf.writelines(get_uniq_lines(prev_page, lines))
            # Remember the whole trimmed page, not only its new lines;
            # otherwise a line removed here as a duplicate would be written
            # again when it shows up on the next page.
            prev_page = lines

    with open("skipped_gibberish", "w") as lf:
        lf.writelines(gibberish_lines)


if __name__ == "__main__":
    main()
|