summaryrefslogtreecommitdiff
path: root/extract_text_old.sh
blob: 810423e25662bbfc154fce0e480eb33c302c6acb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/bin/sh
#
# OCR subtitle text out of a video: sample frames with ffmpeg, crop and
# grayscale them with ImageMagick, run tesseract on each crop, then
# concatenate the recognised text under result/.
#
# Usage: extract_text_old.sh VIDEO [LANG]
#   VIDEO  path of the video file to process (required)
#   LANG   language passed to `tesseract -l` (default: deu)
#
# Exits with status 1 when VIDEO is missing.

# -e: stop on the first failing command; -u: treat unset variables as errors.
set -eu

filename=${1:-}
lang=${2:-deu}

# A plain `if` is used instead of `test && echo && exit`: in an AND list a
# failing `echo` would silently skip the `exit` and let the script carry on
# with an empty filename. The diagnostic goes to stderr.
if [ -z "$filename" ]; then
    echo "no filename provided" >&2
    exit 1
fi

# Working directories, created relative to the current directory.
mkdir -p frames subs result

# Sample the video at 0.01 frames per second, i.e. one frame every 100 seconds,
# writing frames/ffmpeg_1.jpg, frames/ffmpeg_2.jpg, ...
ffmpeg -i "$filename" -r 0.01 frames/ffmpeg_%0d.jpg

# Replace frame 2 with the single frame found 5 seconds into the video.
# NOTE(review): the reason frame 2 specifically is overwritten is not visible
# here - confirm before changing.
# -y overwrites the existing file without prompting (this used to be done by
# piping `yes` into ffmpeg); -nostdin keeps ffmpeg from reading standard input.
ffmpeg -nostdin -y -i "$filename" -ss 00:00:05 -vframes 1 frames/ffmpeg_2.jpg

# Crop every sampled frame, convert it to grayscale and OCR the result.
# The glob (instead of parsing `ls`) keeps file names intact and limits the
# loop to the ffmpeg_*.jpg frames, so the frames/sub_*.jpg crops written below
# by an earlier run are not fed back in as input.
for img in frames/ffmpeg_*.jpg
do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$img" ] || continue

    # Frame name without directory and without the .jpg suffix.
    stem=${img##*/}
    stem=${stem%.jpg}
    sub_img_name="frames/sub_$stem.jpg"
    sub_out="subs/$stem"

    # Chop a 220-pixel-wide strip off the West and the East edge, then
    # convert what is left to grayscale.
    magick convert "$img" -gravity West -chop 220x0 -gravity East -chop 220x0 \
        -colorspace Gray "$sub_img_name"
    # dpi=$(magick identify -format '%x' $sub_img_name)
    # echo "$sub_img_name - dpi: $dpi"
    # dpi=300

    # Progress output: the crop that is about to be OCRed.
    echo "$sub_img_name"

    # call tesseract to get the subs; $sub_out is the output base name
    # (the later concatenation step reads the resulting files from subs/).
    # tesseract -l $lang --dpi $dpi $sub_img_name $sub_out
    tesseract -l "$lang" "$sub_img_name" "$sub_out"
done

# From here on only the file name (without its directory) is used, to name
# the output files under result/.
filename=$(basename -- "$filename")

# Concatenate the per-frame OCR text files in numeric frame order into
# result/<name>.txt.
# - The subshell keeps the directory change local, and a failing `cd` aborts
#   the script instead of being ignored as part of an AND list.
# - Sorting numerically on everything after the "_" orders ffmpeg_2.txt before
#   ffmpeg_10.txt and has no limit on the number of digits.
# - LC_ALL=C makes the numeric parsing independent of the user's locale.
(
    cd subs
    for txt in ffmpeg_*.txt
    do
        # An unmatched glob is left as the literal pattern; skip it.
        [ -e "$txt" ] || continue
        printf '%s\n' "$txt"
    done | LC_ALL=C sort -t _ -k 2,2n | while IFS= read -r txt
    do
        cat "$txt"
    done
) > "result/$filename.txt"

# Post-process with the helper script next to the current directory.
# NOTE(review): presumably this removes duplicated subtitle text from subs/
# and writes to result/<name>_out - confirm against remove_dups.py.
./remove_dups.py subs "result/${filename}_out"

# rm -rf subs
# rm -rf frames