#!/bin/sh
# Extract burned-in subtitles from a video by OCR-ing sampled frames.
#
# Usage: <this-script> VIDEO [LANG]
#   VIDEO  path to the input video (required)
#   LANG   tesseract language code (default: deu)
#
# Side effects (all relative to the current directory):
#   frames/                 sampled frames (ffmpeg_N.jpg) and their
#                           preprocessed copies (sub_ffmpeg_N.jpg)
#   subs/                   per-frame OCR text (ffmpeg_N.txt)
#   result/<video>.txt      all OCR text concatenated in frame order
#   ./remove_dups.py is then run as: remove_dups.py subs result/<video>_out
#
# Requires ffmpeg, ImageMagick (`magick`), tesseract and ./remove_dups.py.
# Exits 1 if no VIDEO is given; any failing command aborts the script.
set -eu

filename=${1:-}
lang=${2:-deu}

if [ -z "$filename" ]; then
  echo "no filename provided" >&2
  exit 1
fi

mkdir -p frames subs result

# Split the video into frames: an output rate of 0.01 fps is one frame
# every 100 seconds.
ffmpeg -i "$filename" -r 0.01 frames/ffmpeg_%0d.jpg

# Replace frame 2 with the frame found at 00:00:05. `-y` lets ffmpeg
# overwrite the existing file without prompting.
ffmpeg -y -i "$filename" -ss 00:00:05 -vframes 1 frames/ffmpeg_2.jpg

# Preprocess each sampled frame and OCR it. The glob matches only the
# ffmpeg_N.jpg frames, so sub_*.jpg files left in frames/ by an earlier run
# are not fed back through the pipeline.
for img in frames/ffmpeg_*.jpg; do
  [ -e "$img" ] || continue # glob matched nothing

  frame=${img##*/}    # drop the directory part
  frame=${frame%.jpg} # drop the literal extension
  sub_img_name="frames/sub_$frame.jpg"
  sub_out="subs/$frame"

  # Chop 220 px off the left and right edges and convert to grayscale.
  magick convert "$img" \
    -gravity West -chop 220x0 \
    -gravity East -chop 220x0 \
    -colorspace Gray "$sub_img_name"

  # Optional: pass an explicit DPI to tesseract.
  # dpi=$(magick identify -format '%x' "$sub_img_name")
  # echo "$sub_img_name - dpi: $dpi"
  # dpi=300
  printf '%s\n' "$sub_img_name"

  # tesseract appends ".txt" to the output base name itself.
  # tesseract -l "$lang" --dpi "$dpi" "$sub_img_name" "$sub_out"
  tesseract -l "$lang" "$sub_img_name" "$sub_out"
done

filename=$(basename -- "$filename")

# Concatenate the per-frame texts in numeric frame order. The subshell keeps
# the `cd` local and makes a failed `cd` abort the script. Sorting on the
# whole number after the underscore stays correct past 999 frames.
(
  cd subs
  for txt in ffmpeg_*.txt; do
    if [ -e "$txt" ]; then
      printf '%s\n' "$txt"
    fi
  done | sort -t _ -k 2,2n | xargs cat
) > "result/$filename.txt"

./remove_dups.py subs "result/${filename}_out"

# rm -rf subs
# rm -rf frames