summaryrefslogtreecommitdiff
path: root/quality_check.py
blob: 6b55432aae23104889edaeb1e4db63b30f9fed7c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#!/usr/bin/env python

import sys
import enchant

# Punctuation characters; word_ratio() uses this string to skip non-word tokens.
non_words = "\n,.;!?'"

# Path of the text file to rate, taken from the first command-line argument.
# Example invocation: ./quality_check.py text/ffmpeg_2.txt
textfile = sys.argv[1]

# Enchant dictionary for German (de_DE), used to look up each word.
ge_dict = enchant.Dict("de_DE")

def word_ratio(words):
    """Return the share of tokens that the German dictionary accepts.

    Characters listed in ``non_words`` are removed from the start and end
    of every token before the lookup, so ``"Hallo,"`` is checked as
    ``"Hallo"``.  Tokens that consist only of such characters (``","``,
    ``"..."``, ``"?!"``) are not counted at all.

    Args:
        words: Iterable of string tokens.

    Returns:
        A ``(ratio, count)`` tuple.  ``count`` is the number of tokens that
        were looked up in ``ge_dict`` and ``ratio`` is the fraction of them
        the dictionary accepted.  ``(0, 0)`` if no token was looked up.
    """
    # str.strip(chars) treats its argument as a set of characters.  The
    # previous ``w not in non_words`` was a substring test on a string, so
    # punctuation runs such as "..." or "?!" were scored as misspelled words.
    stripped = (w.strip(non_words) for w in words)
    # Empty results are pure punctuation; skip them rather than look them up.
    sane = [ge_dict.check(w) for w in stripped if w]
    if not sane:
        return 0, 0
    return sum(sane) / len(sane), len(sane)

def read_into_list(filename):
    """Read a text file and return its whitespace-separated tokens.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of strings produced by splitting the whole file content on
        runs of whitespace; an empty list for an empty file.
    """
    # Decode explicitly as UTF-8 so that non-ASCII letters (umlauts, sharp s)
    # are read the same way regardless of the platform's locale default.
    # NOTE(review): assumes the input files are UTF-8 encoded -- confirm.
    with open(filename, "r", encoding="utf-8") as lf:
        return lf.read().split()

if __name__ == "__main__":
    # Tokenize the input file and score it in one step.
    ratio, length = word_ratio(read_into_list(textfile))
    # One colon-separated record per file: <ratio>:<tokens checked>:<path>
    print(f"{ratio}:{length}:{textfile}")