summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGrail Finder <wohilas@gmail.com>2022-12-23 14:56:54 +0600
committerGrail Finder <wohilas@gmail.com>2022-12-23 14:56:54 +0600
commit7ab566ef1034520b0d8a74387db8b65968d7d1c0 (patch)
tree146d6ded9469237af9a5033f498cb0ca640da813
Initial commit
-rw-r--r--.gitignore1
-rw-r--r--.pngbin0 -> 1001859 bytes
-rwxr-xr-xcut_to_borders.sh45
-rwxr-xr-xextract_text.sh46
-rwxr-xr-xextract_text_old.sh43
-rwxr-xr-xparam_seeker.sh81
-rwxr-xr-xquality_check.py26
-rwxr-xr-xremove_dups.py113
8 files changed, 355 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2539602
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+tutorial/
diff --git a/.png b/.png
new file mode 100644
index 0000000..fc3b5c9
--- /dev/null
+++ b/.png
Binary files differ
diff --git a/cut_to_borders.sh b/cut_to_borders.sh
new file mode 100755
index 0000000..3de37a3
--- /dev/null
+++ b/cut_to_borders.sh
@@ -0,0 +1,45 @@
+#!/bin/sh
+
+set -e
+
+# get frames path from cli args
+framespath=${1:-}
+[ -z "$framespath" ] && echo "no framespath provided" && exit 1
+
+# draw random pic
+mkdir -p gray
+imageName=$(find "$framespath" -type f | sort -R | tail -1)
+shavedName="gray/"$(basename "$imageName")
+
+printf "working with $imageName\r"
+
+# detect image border:
+increment=5
+x=150
+limitX=600
+y=50
+colorFuzz=.01
+# borderColor="#f7d9ac"
+borderColor="#ffffff"
+
+while true
+do
+ pixelColor=$(magick convert "$imageName" -crop 1x1+$x+$y -depth 8 txt:- | awk ' NR==2 {print $3}')
+ colorDiff=$(magick compare -metric RMSE xc:"$borderColor" xc:"$pixelColor" null: 2>&1 | sed "s/.*(\(.*\))/\1/")
+ # break condition success
+ echo "fuzz: $colorFuzz; diff: $colorDiff; point: $x:$y"
+ [ 1 -eq "$(echo "$colorFuzz > $colorDiff" | bc)" ] && echo "found border: $x:$y" \
+ && magick convert "$imageName" -colorspace Gray -shave "${x}x0" "$shavedName" \
+ && break
+ # break condition fail
+ [ "$x" -gt "$limitX" ] && echo "failed to find border" && break
+ # update point
+ x=$((x+increment))
+done
+
+# to functions
+
+# cut left border
+# cut right border
+# assume position
+
diff --git a/extract_text.sh b/extract_text.sh
new file mode 100755
index 0000000..7b7174c
--- /dev/null
+++ b/extract_text.sh
@@ -0,0 +1,46 @@
+#!/bin/sh
+# Extract subtitle text from a directory of video frames: crop each
+# frame to the subtitle region, grayscale + resize it, OCR it with
+# tesseract, then concatenate and de-duplicate the per-frame output.
+#
+# usage: ./extract_text.sh <framespath> [lang]   (lang defaults to deu)
+
+set -e
+
+framespath=${1:-}
+lang=${2:-deu}
+
+[ -z "$framespath" ] && echo "no framespath provided" && exit 1
+
+mkdir -p text result
+
+# split video on frames (every 90 seconds)
+# ffmpeg -i "$videopath" -r 0.011 frames/ffmpeg_%0d.jpg
+# yes | ffmpeg -i "$videopath" -ss 00:00:05 -vframes 1 frames/ffmpeg_2.jpg
+
+# let's say we don't know the extension
+# ext=".jpg"
+
+# convert each frame to grayscale then crop to text if any
+for img in "$framespath"/*
+do
+    iname=$(basename "$img")
+    sub_img_name="frames/sub_${iname}"
+    # strip the image extension to build the tesseract output base name
+    # (dot escaped so only a literal ".ext" is removed)
+    sub_out="text/$(echo "$iname" | sed 's/\.\(png\|gif\|jpg\|jpeg\|bmp\)//')"
+    # BUG FIX: $img already carries $framespath/, so the old
+    # "frames/$img" produced the bogus path frames/<framespath>/...
+    magick convert "$img" -chop 220x0 -gravity East -chop 220x0 \
+        -gravity South -chop 0x50 \
+        -colorspace Gray -resize 600x "$sub_img_name"
+    # dpi=$(magick identify -format '%x' $sub_img_name)
+    # echo "$sub_img_name - dpi: $dpi"
+    # dpi=300
+
+    printf "\r%s" "$sub_img_name"
+
+    # call tesseract to get the text
+    # tesseract -l $lang --dpi $dpi $sub_img_name $sub_out
+    tesseract -l "$lang" "$sub_img_name" "$sub_out"
+done
+
+resultfile=$(basename "$framespath")
+# sort per-frame files numerically on the frame number in the name
+# BUG FIX: text/ and result/ were created in the CWD above, so write
+# there directly; the old "../result" + "cd ../" was a leftover from
+# extract_text_old.sh, which cd'ed into subs/ first.
+find text -type f | sort -n -k 1.13,1.15 | xargs cat > "result/${resultfile}.txt"
+
+./remove_dups.py text "result/${resultfile}_clean"
+
+# rm -rf text
+# rm -rf frames
diff --git a/extract_text_old.sh b/extract_text_old.sh
new file mode 100755
index 0000000..810423e
--- /dev/null
+++ b/extract_text_old.sh
@@ -0,0 +1,43 @@
+#!/bin/sh
+# Legacy pipeline: split a video into frames with ffmpeg, crop each
+# frame to the subtitle band, OCR it with tesseract, then merge and
+# de-duplicate the per-frame text.  Superseded by extract_text.sh.
+# NOTE(review): variables are unquoted throughout, so any path with
+# whitespace will break word-splitting -- kept as-is in this old copy.
+
+set -e
+
+filename=${1:-}
+lang=${2:-deu}
+
+[ -z "$filename" ] && echo "no filename provided" && exit 1
+
+mkdir -p frames subs result
+
+# split video on frames (every 90 seconds)
+ffmpeg -i $filename -r 0.01 frames/ffmpeg_%0d.jpg
+
+# additionally grab a single frame at the 5-second mark as frame 2
+yes | ffmpeg -i $filename -ss 00:00:05 -vframes 1 frames/ffmpeg_2.jpg
+
+# convert each frame to grayscale then crop to subs if any
+for img in $(ls -1 frames)
+do
+    basename=$(echo $img | sed "s/.jpg//")
+    sub_img_name="frames/sub_$basename.jpg"
+    sub_out="subs/$basename"
+    magick convert frames/$img -gravity West -chop 220x0 -gravity East -chop 220x0 \
+        -colorspace Gray $sub_img_name
+    # dpi=$(magick identify -format '%x' $sub_img_name)
+    # echo "$sub_img_name - dpi: $dpi"
+    # dpi=300
+
+    echo $sub_img_name
+
+    # call tesseract to get the subs
+    # tesseract -l $lang --dpi $dpi $sub_img_name $sub_out
+    tesseract -l $lang $sub_img_name $sub_out
+done
+
+filename=$(basename "$filename")
+# sort OCR output files numerically on the frame number in the name
+cd subs && ls -1 . | sort -n -k 1.8,1.10 | xargs cat > ../result/"$filename".txt
+cd ../
+
+./remove_dups.py subs result/"$filename"_out
+
+# rm -rf subs
+# rm -rf frames
diff --git a/param_seeker.sh b/param_seeker.sh
new file mode 100755
index 0000000..eabefa5
--- /dev/null
+++ b/param_seeker.sh
@@ -0,0 +1,81 @@
+#!/bin/sh
+# Parameter sweep for OCR quality: shave the border off one random
+# frame, generate variants at increasing scale / bottom-crop values,
+# OCR every variant, and score each resulting text file.
+
+set -e
+
+# get frames path from cli args
+framespath=${1:-}
+[ -z "$framespath" ] && echo "no framespath provided" && exit 1
+
+# draw random pic
+mkdir -p test
+imageName=$(find "$framespath" -type f | sort -R | tail -1)
+shavedName="test/"$(basename "$imageName")
+
+echo "working with $imageName"
+
+# detect image border:
+increment=5
+x=150
+limitX=400
+y=50
+# maximum normalised RMSE for a sampled pixel to count as "border"
+colorFuzz=.04
+borderColor="#f7d9ac"
+
+# scan right along row y until a pixel matches the border colour,
+# then shave that many columns and save a grayscale copy
+while true
+do
+    pixelColor=$(magick convert "$imageName" -crop 1x1+$x+$y -depth 8 txt:- | awk ' NR==2 {print $3}')
+    colorDiff=$(magick compare -metric RMSE xc:"$borderColor" xc:"$pixelColor" null: 2>&1 | sed "s/.*(\(.*\))/\1/")
+    # break condition success
+    echo "fuzz: $colorFuzz; diff: $colorDiff; point: $x:$y"
+    [ 1 -eq "$(echo "$colorFuzz > $colorDiff" | bc)" ] && echo "found border: $x:$y" \
+        && magick convert "$imageName" -colorspace Gray -shave "${x}x0" "$shavedName" \
+        && break
+    # break condition fail
+    [ "$x" -gt "$limitX" ] && echo "failed to find border" && break
+    # update point
+    x=$((x+increment))
+done
+
+# generate bunch of cut and gray images
+# two param types: scale and bottom crop
+scaleX=400
+scaleInc=20
+bottomCrop=10
+bottomInc=10
+
+# both limits are reached in 20 runs
+loopRunLimit=20
+loop=0
+
+# BUG FIX: shavedName already starts with "test/"; prefixing it with
+# "test/" again produced paths under a nonexistent test/test/ dir.
+shavedBase=$(basename "$shavedName")
+
+# first change only scale
+# then only bottomCrop
+# then both
+while true
+do
+    # break condition
+    [ "$loop" -gt "$loopRunLimit" ] && break
+    resizeName="test/${shavedBase}_${scaleX}_0.png"
+    chopName="test/${shavedBase}_0_${bottomCrop}.png"
+    resizeChopName="test/${shavedBase}_${scaleX}_${bottomCrop}.png"
+    magick convert "$shavedName" -adaptive-resize ${scaleX}x "$resizeName"
+    magick convert "$shavedName" -gravity South -chop 0x${bottomCrop} "$chopName"
+    magick convert "$shavedName" -gravity South -chop 0x${bottomCrop} -adaptive-resize ${scaleX}x "$resizeChopName"
+    scaleX=$((scaleX+scaleInc))
+    bottomCrop=$((bottomCrop+bottomInc))
+    loop=$((loop+1))
+done
+
+
+# convert every new image to text
+mkdir -p test_text
+for img in test/*
+do
+    outName=$(basename "$img")
+    tesseract -l deu "$img" "test_text/$outName"
+done
+
+# quality check text files
+for t in test_text/*
+do
+    ./quality_check.py "$t" >> quality_out
+done
diff --git a/quality_check.py b/quality_check.py
new file mode 100755
index 0000000..6b55432
--- /dev/null
+++ b/quality_check.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python
+"""Score an OCR text file: fraction of its tokens that are valid German
+words, printed as "ratio:count:path"."""
+
+import sys
+import enchant
+
+# path of the OCR output file to score (required positional argument)
+textfile = sys.argv[1]
+
+# use example: ./quality_check.py text/ffmpeg_2.txt
+
+# NOTE(review): used with `w not in non_words`, i.e. SUBSTRING
+# membership -- a token is skipped when it is a substring of this
+# string (bare punctuation like "," or ".;"), not char-by-char.
+non_words = "\n ,.;!?'"
+ge_dict = enchant.Dict('de_DE')
+
+def word_ratio(words):
+    """Return (valid_fraction, counted) for the given token list.
+
+    Tokens that are substrings of ``non_words`` (bare punctuation) are
+    skipped.  ``sane`` is a list of bools, so ``sum(sane)`` counts the
+    tokens accepted by the German dictionary.  Returns (0, 0) when no
+    token survives the filter, avoiding a division by zero.
+    """
+    sane = [ge_dict.check(w) for w in words if w not in non_words]
+    if len(sane) == 0:
+        return 0, 0
+    return sum(sane) / len(sane), len(sane)
+
+def read_into_list(filename):
+    """Read *filename* and return its whitespace-separated tokens."""
+    with open(filename, "r") as lf:
+        return lf.read().split()
+
+if __name__ == "__main__":
+    # score the file named on the command line: "ratio:count:path"
+    words = read_into_list(textfile)
+    ratio, length = word_ratio(words)
+    print(f"{ratio}:{length}:{textfile}")
diff --git a/remove_dups.py b/remove_dups.py
new file mode 100755
index 0000000..cc94416
--- /dev/null
+++ b/remove_dups.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+"""Merge per-frame OCR text files into one document, dropping lines
+that already appeared on the previous page and skipping lines that are
+mostly gibberish (non-dictionary words)."""
+
+import os
+import glob
+import enchant
+from difflib import SequenceMatcher as SM
+
+# directory holding the per-frame OCR text files (required argument)
+directory = os.sys.argv[1]
+# default output path, overridden by the optional second argument
+outname = f"result_out"
+
+if len(os.sys.argv) > 2:
+    outname = os.sys.argv[2]
+
+# use example: ./remove_dups.py text result/v12_clean.txt
+
+ge_dict = enchant.Dict('de_DE')
+# lines rejected as gibberish are collected here and dumped at the end
+gibberish_lines = []
+
+
+def clear_start(lines):
+    """Trim leading junk: return *lines* starting at the first line that
+    holds real text, additionally dropping that line (and recording it
+    in ``gibberish_lines``) when fewer than 80% of its words are valid.
+    """
+    text_index = find_first_text_line_index(lines)
+    ratio = gibberish_ratio(lines[text_index])
+    if ratio < .8: # arbitrary
+        # too much gibberish, skip
+        gibberish_lines.append(lines[text_index])
+        return lines[text_index+1:]
+    # return lines starting with first text line
+    return lines[text_index:]
+
+
+def gibberish_ratio(line):
+ words = line.split(" ")
+ # list of bool: [True, False, True...]
+ sane = [ge_dict.check(w) for w in words if w != ""]
+ return sum(sane) / len(sane)
+
+
+def find_first_text_line_index(lines):
+    """Return the index of the first line with at least two real
+    space-separated tokens, or 0 when no such line exists."""
+    for i, l in enumerate(lines):
+        # search for first proper line
+        words = l.split(" ")
+        # split(" ") keeps newline/space artifacts; drop them first
+        words = [w for w in words if w not in ("\n", "\r", " ")]
+        if len(words) > 1:
+            # found line with text
+            return i
+    # unexpected case
+    return 0
+
+
+# checks if current_line exists in previous page
+def in_prev_lines(prev_lines, current_line):
+    """Return True when *current_line* should be dropped: it is a lone
+    space (form-feed artifact), a second consecutive blank line, or
+    more than 90% similar (difflib ratio) to a previous-page line."""
+    if current_line == " ":
+        return True # delete ^L char lines
+    if len(prev_lines) == 0:
+        return False # no prev lines given
+    if current_line == "\n" and prev_lines[-1] == "\n":
+        return True # delete \n if two in a row
+    for l in prev_lines:
+        if l == "\n":
+            # blank lines would match almost anything; skip them
+            continue
+        ratio = SM(None, l, current_line).ratio()
+        if ratio > .9: # close enough
+            return True
+    return False
+
+
+# returns lines without duplicates
+def get_uniq_lines(prev_lines, curr_lines):
+    """Filter *curr_lines*, keeping only lines that are not (fuzzily)
+    present in *prev_lines* -- see in_prev_lines."""
+    uniq_lines = []
+    for cl in curr_lines:
+        if not in_prev_lines(prev_lines, cl):
+            uniq_lines.append(cl)
+    return uniq_lines
+
+
+def sortKeyFunc(s):
+    """Numeric sort key: the frame number embedded in the file name.
+
+    NOTE(review): the slice [7:-8] presumes basenames with a 7-char
+    prefix and 8 trailing chars around the number; the commented-out
+    [5:-4] variant matched "gray_<n>.txt"-style names.  Confirm which
+    naming scheme the input directory actually uses before relying on
+    this ordering.
+    """
+    # ffmpeg_
+    # 012345
+    # gray_123
+    # .txt
+    # 1234
+    # return int(os.path.basename(s)[5:-4])
+    return int(os.path.basename(s)[7:-8])
+
+
+if __name__ == "__main__":
+    files = glob.glob(f"{directory}/*")
+    # process the OCR pages in frame order
+    files.sort(key=sortKeyFunc)
+
+    prev_page = []
+    page_sep = "-"*60 + "\n"
+
+    with open(outname, "w") as wf:
+        for filename in files:
+            with open(filename) as rf:
+                lines = rf.readlines()
+                # trim gibberish at the start, then (via reverse) at the end
+                lines = clear_start(lines)
+                lines.reverse()
+                lines = clear_start(lines)
+                lines.reverse()
+                lines.append(page_sep)
+
+                # compare with prev data
+                new_page = get_uniq_lines(prev_page, lines)
+
+                # write to file
+                wf.writelines(new_page)
+
+                # cache to prev data
+                prev_page = new_page
+
+    with open("skipped_gibberish", "w") as lf:
+        lf.writelines(gibberish_lines)