#!/bin/sh
#
# extract_text.sh - OCR every frame image in a directory with tesseract.
#
# Usage: ./extract_text.sh FRAMESPATH [LANG]
#   FRAMESPATH  directory holding the frame images (required)
#   LANG        tesseract language code (default: deu)
#
# All paths are relative to the directory the script is started from,
# which must also contain remove_dups.py:
#   frames/sub_<name>      cropped, grayscale copy of each frame
#   text/<name>            tesseract output base for each frame
#   result/<dir>.txt       per-frame text concatenated in frame order
#   result/<dir>_clean     second argument handed to ./remove_dups.py
#                          (presumably its output path - confirm there)
#
# Exits with status 1 when FRAMESPATH is missing; any failing command
# aborts the run (set -e).

set -eu

framespath=${1:-}
lang=${2:-deu}

if [ -z "$framespath" ]; then
    echo "no framespath provided" >&2
    exit 1
fi

# frames/ receives the cropped images, so it has to exist as well.
mkdir -p frames text result

# Frames are expected to be produced beforehand, e.g. one every 90 seconds:
# ffmpeg -i "$videopath" -r 0.011 frames/ffmpeg_%0d.jpg
# yes | ffmpeg -i "$videopath" -ss 00:00:05 -vframes 1 frames/ffmpeg_2.jpg

# Crop each frame, convert it to grayscale, then run OCR on the result.
for img in "$framespath"/*
do
    # An empty directory leaves the glob unexpanded; skip anything that is
    # not a regular file instead of handing a bogus path to magick.
    [ -f "$img" ] || continue

    iname=$(basename "$img")
    sub_img_name="frames/sub_${iname}"
    # The extension is not known in advance: drop whatever follows the
    # last dot, and nothing else.
    sub_out="text/${iname%.*}"

    # $img already carries the $framespath prefix, so it is used as is.
    # Chop 220px off the left and right edges and 50px off the bottom,
    # then convert to grayscale and scale to a width of 600px.
    magick convert "$img" -chop 220x0 -gravity East -chop 220x0 \
        -gravity South -chop 0x50 \
        -colorspace Gray -resize 600x "$sub_img_name"
    # dpi=$(magick identify -format '%x' "$sub_img_name")
    # echo "$sub_img_name - dpi: $dpi"
    # dpi=300

    # Single-line progress indicator.
    printf "\r%s" "$sub_img_name"

    # Call tesseract to get the text.
    # tesseract -l "$lang" --dpi "$dpi" "$sub_img_name" "$sub_out"
    tesseract -l "$lang" "$sub_img_name" "$sub_out"
done

resultfile=$(basename "$framespath")

# Concatenate the per-frame text in frame order. Assumes names of the form
# <prefix>_<number> (as in ffmpeg_%0d above): the numeric sort key is the
# field after the first underscore, so any number of digits is handled.
find text -type f | sort -t _ -k 2,2n | xargs cat > "result/${resultfile}.txt"

./remove_dups.py text "result/${resultfile}_clean"

# rm -rf text
# rm -rf frames