From 7ab566ef1034520b0d8a74387db8b65968d7d1c0 Mon Sep 17 00:00:00 2001
From: Grail Finder
Date: Fri, 23 Dec 2022 14:56:54 +0600
Subject: Initial commit

---
Review notes (this section is ignored when the patch is applied):
- Line structure of the flattened export is restored; indentation inside the
  hunks is reconstructed and may differ from the committed files.
- The hunks carry reviewed versions of the scripts, so hunk lengths and the
  diffstat below differ from the original commit (355 insertions). The blob
  ids on the "index" lines still name the original blobs.
- cut_to_borders.sh, param_seeker.sh: stop with status 1 when no image or no
  border is found; param_seeker.sh no longer writes to test/test/ and runs
  the variant loop 20 times, as its comment states.
- extract_text.sh: reads "$img" instead of "frames/$img", creates frames/,
  writes to result/ in the current directory, strips only a real extension.
- extract_text_old.sh: quoted expansions, glob instead of parsing ls.
- quality_check.py, remove_dups.py: argv is read under the __main__ guard,
  files are opened as UTF-8, empty pages and empty lines no longer raise,
  the page number is parsed with a regex, and the page separator is written
  after every page.

 .gitignore          |   1 +
 .png                | Bin 0 -> 1001859 bytes
 cut_to_borders.sh   |  53 ++++++++++++++++++++
 extract_text.sh     |  54 ++++++++++++++++++++
 extract_text_old.sh |  49 ++++++++++++++++++
 param_seeker.sh     |  88 ++++++++++++++++++++++++++++++++
 quality_check.py    |  42 ++++++++++++++++
 remove_dups.py      | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 428 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .png
 create mode 100755 cut_to_borders.sh
 create mode 100755 extract_text.sh
 create mode 100755 extract_text_old.sh
 create mode 100755 param_seeker.sh
 create mode 100755 quality_check.py
 create mode 100755 remove_dups.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2539602
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+tutorial/
diff --git a/.png b/.png
new file mode 100644
index 0000000..fc3b5c9
Binary files /dev/null and b/.png differ
diff --git a/cut_to_borders.sh b/cut_to_borders.sh
new file mode 100755
index 0000000..3de37a3
--- /dev/null
+++ b/cut_to_borders.sh
@@ -0,0 +1,53 @@
+#!/bin/sh
+# Pick a random image from FRAMES_DIR, walk right along row y=50 from x=150
+# until a pixel close to $borderColor is found, then write a grayscale copy
+# shaved by that x offset to gray/.
+#
+# usage: ./cut_to_borders.sh FRAMES_DIR
+
+set -e
+
+# get frames path from cli args
+framespath=${1:-}
+[ -z "$framespath" ] && echo "no framespath provided" && exit 1
+
+# draw random pic
+mkdir -p gray
+imageName=$(find "$framespath" -type f | sort -R | tail -1)
+# an empty directory would otherwise surface later as an obscure magick error
+[ -z "$imageName" ] && echo "no files found in $framespath" && exit 1
+shavedName="gray/"$(basename "$imageName")
+
+# the name is an argument, not the format: '%' or '\' in it stay literal
+printf 'working with %s\r' "$imageName"
+
+# detect image border:
+increment=5
+x=150
+limitX=600
+y=50
+colorFuzz=.01
+# borderColor="#f7d9ac"
+borderColor="#ffffff"
+
+while true
+do
+    pixelColor=$(magick convert "$imageName" -crop 1x1+$x+$y -depth 8 txt:- | awk ' NR==2 {print $3}')
+    colorDiff=$(magick compare -metric RMSE xc:"$borderColor" xc:"$pixelColor" null: 2>&1 | sed "s/.*(\(.*\))/\1/")
+    # break condition success
+    echo "fuzz: $colorFuzz; diff: $colorDiff; point: $x:$y"
+    [ 1 -eq "$(echo "$colorFuzz > $colorDiff" | bc)" ] && echo "found border: $x:$y" \
+        && magick convert "$imageName" -colorspace Gray -shave "${x}x0" "$shavedName" \
+        && break
+    # break condition fail: nothing was written, so report it in the exit status
+    [ "$x" -gt "$limitX" ] && echo "failed to find border" && exit 1
+    # update point
+    x=$((x+increment))
+done
+
+# to functions
+
+# cut left border
+# cut right border
+# assume position
+
diff --git a/extract_text.sh b/extract_text.sh
new file mode 100755
index 0000000..7b7174c
--- /dev/null
+++ b/extract_text.sh
@@ -0,0 +1,54 @@
+#!/bin/sh
+# Crop and grayscale every image in FRAMES_DIR, OCR each one with tesseract,
+# concatenate the text into result/<basename of FRAMES_DIR>.txt and let
+# remove_dups.py write a de-duplicated copy to result/<basename>_clean.
+#
+# usage: ./extract_text.sh FRAMES_DIR [TESSERACT_LANG]   (default lang: deu)
+
+set -e
+
+framespath=${1:-}
+lang=${2:-deu}
+
+[ -z "$framespath" ] && echo "no framespath provided" && exit 1
+
+# frames/ receives the cropped copies, so it has to exist too
+mkdir -p text result frames
+
+# split video on frames (every 90 seconds)
+# ffmpeg -i "$videopath" -r 0.011 frames/ffmpeg_%0d.jpg
+# yes | ffmpeg -i "$videopath" -ss 00:00:05 -vframes 1 frames/ffmpeg_2.jpg
+
+# let's say we don't know the extension
+# ext=".jpg"
+
+# convert each frame to grayscale then crop to text if any
+for img in "$framespath"/*
+do
+    iname=$(basename "$img")
+    sub_img_name="frames/sub_${iname}"
+    # drop only a trailing image extension: literal dot, anchored at the end
+    sub_out="text/$(echo "$iname" | sed -E 's/\.(png|gif|jpg|jpeg|bmp)$//')"
+    # $img already carries the $framespath prefix from the glob
+    magick convert "$img" -chop 220x0 -gravity East -chop 220x0 \
+        -gravity South -chop 0x50 \
+        -colorspace Gray -resize 600x "$sub_img_name"
+    # dpi=$(magick identify -format '%x' $sub_img_name)
+    # echo "$sub_img_name - dpi: $dpi"
+    # dpi=300
+
+    printf "\r%s" "$sub_img_name"
+
+    # call tesseract to get the text
+    # tesseract -l $lang --dpi $dpi $sub_img_name $sub_out
+    tesseract -l "$lang" "$sub_img_name" "$sub_out"
+done
+
+resultfile=$(basename "$framespath")
+# text/ and result/ both live in the current directory; no cd is involved
+find text -type f | sort -n -k 1.13,1.15 | xargs cat > "result/${resultfile}.txt"
+
+./remove_dups.py text "result/${resultfile}_clean"
+
+# rm -rf text
+# rm -rf frames
diff --git a/extract_text_old.sh b/extract_text_old.sh
new file mode 100755
index 0000000..810423e
--- /dev/null
+++ b/extract_text_old.sh
@@ -0,0 +1,49 @@
+#!/bin/sh
+# Older variant that starts from a video file: dump frames with ffmpeg, crop
+# and grayscale them, OCR each with tesseract into subs/, then concatenate
+# and de-duplicate the text under result/.
+#
+# usage: ./extract_text_old.sh VIDEO [TESSERACT_LANG]   (default lang: deu)
+
+set -e
+
+filename=${1:-}
+lang=${2:-deu}
+
+[ -z "$filename" ] && echo "no filename provided" && exit 1
+
+mkdir -p frames subs result
+
+# split video on frames (-r 0.01 keeps one frame per 100 seconds)
+ffmpeg -i "$filename" -r 0.01 frames/ffmpeg_%0d.jpg
+
+yes | ffmpeg -i "$filename" -ss 00:00:05 -vframes 1 frames/ffmpeg_2.jpg
+
+# convert each frame to grayscale then crop to subs if any
+# (glob instead of parsing ls; sub_* files from an earlier run are skipped)
+for img in frames/ffmpeg_*.jpg
+do
+    stem=$(basename "$img" .jpg)
+    sub_img_name="frames/sub_$stem.jpg"
+    sub_out="subs/$stem"
+    magick convert "$img" -gravity West -chop 220x0 -gravity East -chop 220x0 \
+        -colorspace Gray "$sub_img_name"
+    # dpi=$(magick identify -format '%x' $sub_img_name)
+    # echo "$sub_img_name - dpi: $dpi"
+    # dpi=300
+
+    echo "$sub_img_name"
+
+    # call tesseract to get the subs
+    # tesseract -l $lang --dpi $dpi $sub_img_name $sub_out
+    tesseract -l "$lang" "$sub_img_name" "$sub_out"
+done
+
+filename=$(basename "$filename")
+cd subs && ls -1 . | sort -n -k 1.8,1.10 | xargs cat > ../result/"$filename".txt
+cd ../
+
+./remove_dups.py subs result/"$filename"_out
+
+# rm -rf subs
+# rm -rf frames
diff --git a/param_seeker.sh b/param_seeker.sh
new file mode 100755
index 0000000..eabefa5
--- /dev/null
+++ b/param_seeker.sh
@@ -0,0 +1,88 @@
+#!/bin/sh
+# Pick a random image from FRAMES_DIR, shave it to the detected border, then
+# generate resized / bottom-cropped variants in test/, OCR each of them and
+# append one quality_check.py line per result to ./quality_out.
+#
+# usage: ./param_seeker.sh FRAMES_DIR
+
+set -e
+
+# get frames path from cli args
+framespath=${1:-}
+[ -z "$framespath" ] && echo "no framespath provided" && exit 1
+
+# draw random pic
+mkdir -p test
+imageName=$(find "$framespath" -type f | sort -R | tail -1)
+[ -z "$imageName" ] && echo "no files found in $framespath" && exit 1
+shavedName="test/"$(basename "$imageName")
+
+echo "working with $imageName"
+
+# detect image border:
+increment=5
+x=150
+limitX=400
+y=50
+colorFuzz=.04
+borderColor="#f7d9ac"
+
+while true
+do
+    pixelColor=$(magick convert "$imageName" -crop 1x1+$x+$y -depth 8 txt:- | awk ' NR==2 {print $3}')
+    colorDiff=$(magick compare -metric RMSE xc:"$borderColor" xc:"$pixelColor" null: 2>&1 | sed "s/.*(\(.*\))/\1/")
+    # break condition success
+    echo "fuzz: $colorFuzz; diff: $colorDiff; point: $x:$y"
+    [ 1 -eq "$(echo "$colorFuzz > $colorDiff" | bc)" ] && echo "found border: $x:$y" \
+        && magick convert "$imageName" -colorspace Gray -shave "${x}x0" "$shavedName" \
+        && break
+    # break condition fail: $shavedName was never written, later steps need it
+    [ "$x" -gt "$limitX" ] && echo "failed to find border" && exit 1
+    # update point
+    x=$((x+increment))
+done
+
+# generate bunch of cut and gray images
+# two param types: scale and bottom crop
+scaleX=400
+scaleInc=20
+bottomCrop=10
+bottomInc=10
+
+# both limits are reached in 20 runs
+loopRunLimit=20
+loop=0
+
+# first change only scale
+# then only bottomCrop
+# then both
+while true
+do
+    # break condition (-ge: loop counts from 0, so this gives exactly 20 runs)
+    [ "$loop" -ge "$loopRunLimit" ] && break
+    # $shavedName already starts with test/, do not prefix it again
+    resizeName="${shavedName}_${scaleX}_0.png"
+    chopName="${shavedName}_0_${bottomCrop}.png"
+    resizeChopName="${shavedName}_${scaleX}_${bottomCrop}.png"
+    magick convert "$shavedName" -adaptive-resize "${scaleX}x" "$resizeName"
+    magick convert "$shavedName" -gravity South -chop "0x${bottomCrop}" "$chopName"
+    magick convert "$shavedName" -gravity South -chop "0x${bottomCrop}" -adaptive-resize "${scaleX}x" "$resizeChopName"
+    scaleX=$((scaleX+scaleInc))
+    bottomCrop=$((bottomCrop+bottomInc))
+    loop=$((loop+1))
+done
+
+
+# convert every new image to text
+mkdir -p test_text
+for img in test/*
+do
+    outName=$(basename "$img")
+    tesseract -l deu "$img" "test_text/$outName"
+done
+
+# quality check text files
+for t in test_text/*
+do
+    ./quality_check.py "$t" >> quality_out
+done
diff --git a/quality_check.py b/quality_check.py
new file mode 100755
index 0000000..6b55432
--- /dev/null
+++ b/quality_check.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+"""Print the share of dictionary words in an OCR text file.
+
+use example: ./quality_check.py text/ffmpeg_2.txt
+Output format: <ratio>:<checked word count>:<file name>
+"""
+
+import sys
+
+import enchant
+
+# characters stripped from both ends of a token before the dictionary lookup
+non_words = "\n ,.;!?'"
+ge_dict = enchant.Dict('de_DE')
+
+
+def word_ratio(words):
+    """Return (ratio of words known to the de_DE dictionary, words checked).
+
+    Tokens that consist of punctuation only are not counted; (0, 0) is
+    returned when nothing is left to check.
+    """
+    stripped = (w.strip(non_words) for w in words)
+    sane = [ge_dict.check(w) for w in stripped if w]
+    if not sane:
+        return 0, 0
+    return sum(sane) / len(sane), len(sane)
+
+
+def read_into_list(filename):
+    """Return the whitespace-separated tokens of *filename* (read as UTF-8)."""
+    with open(filename, "r", encoding="utf-8") as lf:
+        return lf.read().split()
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        sys.exit("usage: quality_check.py TEXTFILE")
+    textfile = sys.argv[1]
+    words = read_into_list(textfile)
+    ratio, length = word_ratio(words)
+    print(f"{ratio}:{length}:{textfile}")
diff --git a/remove_dups.py b/remove_dups.py
new file mode 100755
index 0000000..cc94416
--- /dev/null
+++ b/remove_dups.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python
+"""Merge per-page OCR text files into one file, dropping repeated lines.
+
+Every file in DIRECTORY is one page; pages are ordered by the number in
+their file name. Leading and trailing lines without text are cut from each
+page, and lines that closely match a line kept from the previous page are
+skipped. Lines rejected as gibberish are written to ./skipped_gibberish.
+
+use example: ./remove_dups.py text result/v12_clean.txt
+"""
+
+import glob
+import os
+import re
+import sys
+from difflib import SequenceMatcher as SM
+
+import enchant
+
+ge_dict = enchant.Dict('de_DE')
+gibberish_lines = []
+
+
+def clear_start(lines):
+    """Return *lines* without the leading lines that carry no text.
+
+    The first text line is dropped too (and remembered in gibberish_lines)
+    when fewer than 80% of its words pass the spell check.
+    """
+    if not lines:
+        return lines  # empty page: there is no line to index
+    text_index = find_first_text_line_index(lines)
+    ratio = gibberish_ratio(lines[text_index])
+    if ratio < .8:  # arbitrary
+        # too much gibberish, skip
+        gibberish_lines.append(lines[text_index])
+        return lines[text_index+1:]
+    # return lines starting with first text line
+    return lines[text_index:]
+
+
+def gibberish_ratio(line):
+    """Return the share of words in *line* that the dictionary accepts."""
+    # split() with no argument also drops the newline that split(" ") left
+    # glued to the last word
+    words = line.split()
+    if not words:
+        return 0.0  # nothing to check; avoids a division by zero
+    # list of bool: [True, False, True...]
+    sane = [ge_dict.check(w) for w in words]
+    return sum(sane) / len(sane)
+
+
+def find_first_text_line_index(lines):
+    """Return the index of the first line with more than one word, else 0."""
+    for i, line in enumerate(lines):
+        # search for first proper line; split() ignores runs of blanks, so
+        # a whitespace-only line no longer counts as text
+        if len(line.split()) > 1:
+            # found line with text
+            return i
+    # unexpected case
+    return 0
+
+
+# checks if current_line exists in previous page
+def in_prev_lines(prev_lines, current_line):
+    # NOTE(review): the literal reads as one space here; "\f" is accepted too
+    # on the assumption that the original held a raw form feed - confirm
+    if current_line in (" ", "\f"):
+        return True  # delete ^L char lines
+    if len(prev_lines) == 0:
+        return False  # no prev lines given
+    if current_line == "\n" and prev_lines[-1] == "\n":
+        return True  # delete \n if two in a row
+    for prev in prev_lines:
+        if prev == "\n":
+            continue
+        ratio = SM(None, prev, current_line).ratio()
+        if ratio > .9:  # close enough
+            return True
+    return False
+
+
+# returns lines without duplicates
+def get_uniq_lines(prev_lines, curr_lines):
+    uniq_lines = []
+    for cl in curr_lines:
+        # collapse runs of empty lines inside the current page as well
+        if cl == "\n" and uniq_lines and uniq_lines[-1] == "\n":
+            continue
+        if not in_prev_lines(prev_lines, cl):
+            uniq_lines.append(cl)
+    return uniq_lines
+
+
+def sortKeyFunc(s):
+    """Return the first number in the file name of *s* (-1 if there is none).
+
+    Handles ffmpeg_12.txt as well as ffmpeg_12.png.txt, which the former
+    fixed slice [7:-8] did not.
+    """
+    match = re.search(r"\d+", os.path.basename(s))
+    return int(match.group()) if match else -1
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        sys.exit("usage: remove_dups.py DIRECTORY [OUTFILE]")
+    directory = sys.argv[1]
+    outname = sys.argv[2] if len(sys.argv) > 2 else "result_out"
+
+    files = glob.glob(f"{directory}/*")
+    files.sort(key=sortKeyFunc)
+
+    prev_page = []
+    page_sep = "-"*60 + "\n"
+
+    with open(outname, "w", encoding="utf-8") as wf:
+        for filename in files:
+            with open(filename, encoding="utf-8") as rf:
+                lines = rf.readlines()
+            lines = clear_start(lines)
+            lines.reverse()
+            lines = clear_start(lines)
+            lines.reverse()
+
+            # compare with prev data
+            new_page = get_uniq_lines(prev_page, lines)
+
+            # write to file; the separator is written apart from the page
+            # lines, so it is never dropped as a duplicate of the previous
+            # page's separator
+            wf.writelines(new_page)
+            wf.write(page_sep)
+
+            # cache to prev data
+            prev_page = new_page
+
+    with open("skipped_gibberish", "w", encoding="utf-8") as lf:
+        lf.writelines(gibberish_lines)
-- 
cgit v1.2.3