summary | refs | log | tree | commit | diff
path: root/extract_text.sh
diff options
context:
space:
mode:
author: Grail Finder <wohilas@gmail.com> 2022-12-23 14:56:54 +0600
committer: Grail Finder <wohilas@gmail.com> 2022-12-23 14:56:54 +0600
commit: 7ab566ef1034520b0d8a74387db8b65968d7d1c0 (patch)
tree: 146d6ded9469237af9a5033f498cb0ca640da813 /extract_text.sh
Initial commit
Diffstat (limited to 'extract_text.sh')
-rwxr-xr-x extract_text.sh 46
1 files changed, 46 insertions, 0 deletions
diff --git a/extract_text.sh b/extract_text.sh
new file mode 100755
index 0000000..7b7174c
--- /dev/null
+++ b/extract_text.sh
@@ -0,0 +1,46 @@
+#!/bin/sh
+# Usage: extract_text.sh FRAMES_DIR [LANG]
+# Crops and grayscales every image in FRAMES_DIR into frames/sub_<name>, runs
+# tesseract on it (LANG defaults to deu) with output under text/, concatenates
+# the text into result/<basename of FRAMES_DIR>.txt and runs ./remove_dups.py.
+# All paths are relative to the current directory; run it next to remove_dups.py.
+
+set -e
+
+framespath=${1:-}
+lang=${2:-deu}
+
+if [ -z "$framespath" ]; then
+ echo "no framespath provided" >&2
+ exit 1
+fi
+
+# frames/ receives the cropped images, so create it along with text/ and result/
+mkdir -p frames text result
+
+# the input frames can be cut from a video (one frame every 90 seconds) with:
+# ffmpeg -i "$videopath" -r 0.011 frames/ffmpeg_%0d.jpg
+
+# convert each frame to grayscale then crop to text if any
+for img in "$framespath"/*
+do
+ [ -f "$img" ] || continue # skips the unexpanded pattern when nothing matches
+ iname=$(basename "$img")
+ sub_img_name="frames/sub_${iname}"
+ # the extension is not known in advance: drop whatever follows the last dot
+ sub_out="text/${iname%.*}"
+ # $img already starts with $framespath, so it is handed to magick unchanged
+ magick convert "$img" -chop 220x0 -gravity East -chop 220x0 \
+  -gravity South -chop 0x50 \
+  -colorspace Gray -resize 600x "$sub_img_name"
+ printf "\r%s" "$sub_img_name"
+
+ # call tesseract to get the text ($sub_out is the output base name)
+ tesseract -l "$lang" "$sub_img_name" "$sub_out"
+done
+
+resultfile=$(basename "$framespath")
+# numeric sort on characters 13-15 of each path, i.e. right after "text/ffmpeg_"
+find text -type f | sort -n -k 1.13,1.15 | xargs cat > "result/${resultfile}.txt"
+./remove_dups.py text "result/${resultfile}_clean"
+# optional cleanup: rm -rf text frames