diff options
Diffstat (limited to 'extract_text_old.sh')
-rwxr-xr-x | extract_text_old.sh | 43 |
1 files changed, 43 insertions, 0 deletions
#!/bin/sh
#
# extract_text_old.sh - OCR burned-in subtitles out of a video file.
#
# Provenance: added as a new file (mode 100755, blob 810423e) by the diff
# this listing was taken from.
#
# Usage: extract_text_old.sh <video-file> [tesseract-lang]
#
#   <video-file>      video to sample frames from (required)
#   [tesseract-lang]  language passed to `tesseract -l` (default: deu)
#
# Pipeline:
#   1. ffmpeg dumps sampled frames into ./frames
#   2. ImageMagick crops each frame and converts it to grayscale
#   3. tesseract OCRs each cropped frame into ./subs
#   4. the OCR output is concatenated in frame order into ./result
#   5. ./remove_dups.py post-processes the ./subs directory
#
# Working directories (frames, subs, result) are created in the current
# directory and are intentionally left in place afterwards.

set -eu

filename=${1:-}
lang=${2:-deu}

if [ -z "$filename" ]; then
  # Diagnostics belong on stderr so they never end up in captured output.
  printf '%s\n' "no filename provided" >&2
  exit 1
fi

mkdir -p frames subs result

# Split the video into frames: -r 0.01 is 0.01 fps, i.e. one frame every
# 100 seconds.
ffmpeg -i "$filename" -r 0.01 frames/ffmpeg_%0d.jpg

# Replace frame 2 with the frame found 5 seconds into the video.
# -y overwrites the existing file without prompting.
ffmpeg -y -i "$filename" -ss 00:00:05 -vframes 1 frames/ffmpeg_2.jpg

# Convert each extracted frame to grayscale, chop 220px off the left and
# right edges, then OCR the result. Only ffmpeg_*.jpg is matched so that
# sub_*.jpg files left behind by an earlier run are not processed again.
for img in frames/ffmpeg_*.jpg; do
  # An unmatched glob stays literal in POSIX sh; skip it.
  [ -e "$img" ] || continue

  frame_file=${img##*/}
  stem=${frame_file%.jpg}
  sub_img_name="frames/sub_$stem.jpg"
  sub_out="subs/$stem"

  magick convert "$img" \
    -gravity West -chop 220x0 \
    -gravity East -chop 220x0 \
    -colorspace Gray "$sub_img_name"
  # dpi=$(magick identify -format '%x' $sub_img_name)
  # echo "$sub_img_name - dpi: $dpi"
  # dpi=300

  printf '%s\n' "$sub_img_name"

  # call tesseract to get the subs (it appends .txt to the output base)
  # tesseract -l $lang --dpi $dpi $sub_img_name $sub_out
  tesseract -l "$lang" "$sub_img_name" "$sub_out"
done

out_name=$(basename -- "$filename")

# Concatenate the OCR output in numeric frame order. Splitting on "_" and
# sorting field 2 numerically orders ffmpeg_<N>.txt by N for any number of
# digits. The subshell keeps the `cd` from leaking into the rest of the
# script; the redirection is resolved before the `cd`, so the result path
# stays relative to the starting directory.
(
  cd subs
  for txt in ffmpeg_*.txt; do
    [ -e "$txt" ] || continue
    printf '%s\n' "$txt"
  done | sort -t _ -k 2n | xargs cat
) > "result/$out_name.txt"

./remove_dups.py subs "result/${out_name}_out"

# rm -rf subs
# rm -rf frames