summary refs log tree commit diff
path: root/extract_text_old.sh
diff options
context:
space:
mode:
Diffstat (limited to 'extract_text_old.sh')
-rwxr-xr-x extract_text_old.sh 43
1 files changed, 43 insertions, 0 deletions
diff --git a/extract_text_old.sh b/extract_text_old.sh
new file mode 100755
index 0000000..810423e
--- /dev/null
+++ b/extract_text_old.sh
@@ -0,0 +1,43 @@
+#!/bin/sh
+# Usage: extract_text_old.sh VIDEO [LANG]   (LANG defaults to deu)
+# Samples frames from VIDEO, OCRs each with tesseract and concatenates the
+# text into result/<video name>.txt, then runs ./remove_dups.py on subs/.
+# Run it from the directory containing remove_dups.py.
+set -e
+
+filename=${1:-}
+lang=${2:-deu}
+
+if [ -z "$filename" ]; then
+  echo "no filename provided" >&2
+  exit 1
+fi
+
+mkdir -p frames subs result
+
+# -r 0.01 is 0.01 frames per second, i.e. one frame every 100 seconds
+ffmpeg -i "$filename" -r 0.01 frames/ffmpeg_%0d.jpg
+# overwrite frame 2 with the frame at 00:00:05; -y answers the overwrite prompt
+ffmpeg -y -i "$filename" -ss 00:00:05 -vframes 1 frames/ffmpeg_2.jpg
+
+# glob only the extracted frames so sub_*.jpg from an earlier run are skipped
+for img in frames/ffmpeg_*.jpg; do
+  [ -e "$img" ] || continue # the glob matched nothing
+  stem=${img##*/}
+  stem=${stem%.jpg}
+  sub_img_name="frames/sub_$stem.jpg"
+  # chop 220px off the left and right edges, then convert to grayscale
+  magick convert "$img" -gravity West -chop 220x0 -gravity East -chop 220x0 \
+    -colorspace Gray "$sub_img_name"
+  echo "$sub_img_name"
+  # tesseract appends .txt to the output base, giving subs/ffmpeg_N.txt
+  tesseract -l "$lang" "$sub_img_name" "subs/$stem"
+done
+
+filename=$(basename "$filename")
+# concatenate in frame order: numeric sort on the part after "ffmpeg_"
+(cd subs && ls -1 . | sort -t _ -k 2n | xargs cat) > result/"$filename".txt
+
+./remove_dups.py subs result/"$filename"_out
+
+# rm -rf subs frames