blob: 6b55432aae23104889edaeb1e4db63b30f9fed7c (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
#!/usr/bin/env python
import sys
import enchant
# Path of the text file to analyse, taken from the first command-line
# argument. This runs at import time, so a missing argument raises IndexError.
textfile = sys.argv[1]
# Usage example: ./quality_check.py text/ffmpeg_2.txt
# Newline and punctuation characters; word_ratio() uses this string to keep
# such tokens out of the word statistics.
non_words = "\n,.;!?'"
# pyenchant dictionary for German (de_DE), used by word_ratio() for lookups.
ge_dict = enchant.Dict('de_DE')
def word_ratio(words, *, check=None, punctuation=None):
    """Return the share of tokens in *words* that pass a spell check.

    Punctuation attached to a token (``"Haus,"``) is stripped before the
    lookup, and tokens that consist of punctuation only (``","``, ``"..."``)
    are left out of the statistics entirely.

    Args:
        words: Iterable of string tokens, e.g. the result of ``str.split()``.
        check: Callable taking a word and returning a truthy value when the
            word is known. Defaults to ``ge_dict.check``.
        punctuation: Characters stripped from both ends of every token.
            Defaults to the module-level ``non_words``.

    Returns:
        A ``(ratio, count)`` tuple: ``count`` is the number of tokens that
        were checked and ``ratio`` the fraction of them accepted by *check*.
        ``(0, 0)`` is returned when no token is left to check.
    """
    if check is None:
        check = ge_dict.check
    if punctuation is None:
        punctuation = non_words

    # Strip per character instead of testing ``w in non_words``: the latter
    # is a substring test, so runs like "..." or "?!" slipped through and
    # words with trailing punctuation were looked up verbatim.
    stripped = (w.strip(punctuation) for w in words)
    # Skip tokens that are empty after stripping; they carry no word, and
    # pyenchant's Dict.check refuses empty strings.
    verdicts = [bool(check(token)) for token in stripped if token]

    if not verdicts:
        return 0, 0
    return sum(verdicts) / len(verdicts), len(verdicts)
def read_into_list(filename):
    """Read *filename* and return its whitespace-separated tokens as a list.

    Args:
        filename: Path of the text file to read.

    Returns:
        The file content split on any run of whitespace (``str.split()``
        without arguments); an empty list for an empty file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so umlauts and ß are read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(filename, encoding="utf-8") as text_file:
        return text_file.read().split()
if __name__ == "__main__":
    # Score the file named on the command line and print a single
    # colon-separated record: ratio, number of checked words, file path.
    ratio, length = word_ratio(read_into_list(textfile))
    print(f"{ratio}:{length}:{textfile}")
|