blob: 810423e25662bbfc154fce0e480eb33c302c6acb (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
|
#!/bin/sh
# OCR burned-in subtitles out of a video.
#
# Usage: <this-script> <video> [tesseract-lang]
#   <video>           input handed to `ffmpeg -i` (required)
#   [tesseract-lang]  language passed to `tesseract -l` (default: deu)
#
# Working directories (created in the current directory):
#   frames/  sampled frames (ffmpeg_N.jpg) and their processed copies
#            (sub_ffmpeg_N.jpg)
#   subs/    one tesseract output per frame (ffmpeg_N.txt)
#   result/  <video-basename>.txt      all OCR text, in frame order
#            <video-basename>_out      output path given to ./remove_dups.py
#
# Requires ffmpeg, ImageMagick (`magick`), tesseract and an executable
# ./remove_dups.py in the current directory.
set -eu

filename=${1:-}
lang=${2:-deu}

if [ -z "$filename" ]; then
  echo "no filename provided" >&2
  exit 1
fi

mkdir -p frames subs result

# Sample the video: -r 0.01 is an output rate of 0.01 fps, i.e. one frame
# per 100 seconds.
ffmpeg -i "$filename" -r 0.01 frames/ffmpeg_%0d.jpg
# Replace frame 2 with the frame at 00:00:05. -y overwrites the existing
# file without prompting.
# NOTE(review): the reason frame 2 is singled out is not visible here.
ffmpeg -y -i "$filename" -ss 00:00:05 -vframes 1 frames/ffmpeg_2.jpg

# Chop 220px off the left and right edges of every sampled frame, convert it
# to grayscale, and OCR the result.
# Only ffmpeg_*.jpg is matched so that sub_*.jpg files left in frames/ by an
# earlier run are not processed a second time.
for frame in frames/ffmpeg_*.jpg; do
  # An unmatched glob stays literal; skip it.
  [ -e "$frame" ] || continue

  stem=${frame##*/}
  stem=${stem%.jpg}
  sub_img_name="frames/sub_$stem.jpg"
  sub_out="subs/$stem"

  magick convert "$frame" -gravity West -chop 220x0 -gravity East -chop 220x0 \
    -colorspace Gray "$sub_img_name"
  # dpi=$(magick identify -format '%x' "$sub_img_name")
  # echo "$sub_img_name - dpi: $dpi"
  # dpi=300
  printf '%s\n' "$sub_img_name"
  # call tesseract to get the subs (it appends .txt to the output base)
  # tesseract -l "$lang" --dpi "$dpi" "$sub_img_name" "$sub_out"
  tesseract -l "$lang" "$sub_img_name" "$sub_out"
done

filename=$(basename -- "$filename")

# Concatenate the per-frame text in numeric frame order. Sorting on the
# number after the underscore handles any digit count, and working with
# subs/ paths avoids changing directory.
printf '%s\n' subs/ffmpeg_*.txt \
  | LC_ALL=C sort -t_ -k2,2n \
  | while IFS= read -r txt; do
      # Skips the literal pattern when no text files exist.
      [ -f "$txt" ] || continue
      cat "$txt"
    done > result/"$filename".txt

./remove_dups.py subs result/"$filename"_out
# rm -rf subs
# rm -rf frames
|