blob: bb22de27480891e903c863b04793c46fb70f4e9f (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
#!/bin/sh
# OCR a directory of video frames into text files.
#
# Usage: <this-script> FRAMES_DIR [LANG]
#   FRAMES_DIR  directory holding the frame images (required)
#   LANG        language passed to `tesseract -l` (default: deu)
#
# Outputs, all placed next to FRAMES_DIR (in its parent directory):
#   gray/sub_<frame>        cropped, grayscale, resized copy of every frame
#   text/<frame-stem>       output base handed to tesseract for every frame
#   <FRAMES_DIR name>.txt   concatenation of every file found under text/
# Finally ./remove_dups.py (resolved against the invocation directory) is run
# on the text directory.
#
# Exits with status 1 when FRAMES_DIR is missing; `set -e` aborts the run on
# the first failing command.
set -e

# Remember where we were started from: ./remove_dups.py is looked up there.
currDir=$PWD
framespath=${1:-}
lang=${2:-deu}

if [ -z "$framespath" ]; then
  echo "no framespath provided" >&2
  exit 1
fi

parentDIR=$(dirname "$framespath")
textDIR="$parentDIR"/text
grayDIR="$parentDIR"/gray
mkdir -p "$textDIR" "$grayDIR"

# split video on frames (every 90 seconds)
# ffmpeg -i "$videopath" -r 0.011 frames/ffmpeg_%0d.jpg
# yes | ffmpeg -i "$videopath" -ss 00:00:05 -vframes 1 frames/ffmpeg_2.jpg

# let's say we don't know the extension
# ext=".jpg"

# convert each frame to grayscale, crop it, then run OCR on the result
for img in "$framespath"/*
do
  # An unmatched glob stays literal and sub-directories are not frames:
  # skip anything that is not a regular file instead of failing in magick.
  [ -f "$img" ] || continue

  iname=$(basename "$img")
  sub_img_name="${grayDIR}/sub_${iname}"

  # Drop a trailing image extension (only the known ones) to build the
  # tesseract output base; any other name is used unchanged.
  case "$iname" in
    *.png | *.gif | *.jpg | *.jpeg | *.bmp) stem=${iname%.*} ;;
    *) stem=$iname ;;
  esac
  sub_out="$textDIR/$stem"

  # Chop 220px off the left and right edges and 50px off the bottom, then
  # convert to grayscale and resize to a width of 600px.
  magick convert "$img" -chop 220x0 -gravity East -chop 220x0 \
    -gravity South -chop 0x50 \
    -colorspace Gray -resize 600x "$sub_img_name"

  # dpi=$(magick identify -format '%x' $sub_img_name)
  # echo "$sub_img_name - dpi: $dpi"
  # dpi=300

  # Progress indicator: rewrite the same terminal line for every frame.
  printf "\r%s" "$sub_img_name"

  # call tesseract to get the text
  # tesseract -l $lang --dpi $dpi $sub_img_name $sub_out
  tesseract -l "$lang" "$sub_img_name" "$sub_out"
done

resultfile=$(basename "$framespath")

# Concatenate every file under the text directory into one result file.
# Reading the sorted list line by line keeps paths containing spaces intact.
# NOTE(review): `sort -n` is kept from the original; the paths do not start
# with a digit, so confirm it yields the intended frame order.
find "$textDIR" -type f | sort -n | while IFS= read -r txt; do
  cat "$txt"
done > "${parentDIR}/${resultfile}.txt"

cd "$currDir"
./remove_dups.py "$textDIR" "${parentDIR}/${resultfile}_clean"

# rm -rf text
# rm -rf frames
|