From 7ab566ef1034520b0d8a74387db8b65968d7d1c0 Mon Sep 17 00:00:00 2001
From: Grail Finder <wohilas@gmail.com>
Date: Fri, 23 Dec 2022 14:56:54 +0600
Subject: Initial commit

---
 .gitignore          |   1 +
 .png                | Bin 0 -> 1001859 bytes
 cut_to_borders.sh   |  45 +++++++++++++++++++++
 extract_text.sh     |  46 +++++++++++++++++++++
 extract_text_old.sh |  43 ++++++++++++++++++++
 param_seeker.sh     |  81 +++++++++++++++++++++++++++++++++++++
 quality_check.py    |  26 ++++++++++++
 remove_dups.py      | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 355 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .png
 create mode 100755 cut_to_borders.sh
 create mode 100755 extract_text.sh
 create mode 100755 extract_text_old.sh
 create mode 100755 param_seeker.sh
 create mode 100755 quality_check.py
 create mode 100755 remove_dups.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2539602
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+tutorial/
diff --git a/.png b/.png
new file mode 100644
index 0000000..fc3b5c9
Binary files /dev/null and b/.png differ
diff --git a/cut_to_borders.sh b/cut_to_borders.sh
new file mode 100755
index 0000000..3de37a3
--- /dev/null
+++ b/cut_to_borders.sh
@@ -0,0 +1,55 @@
+#!/bin/sh
+#
+# Pick one random frame from a directory, probe pixels along one row until the
+# border colour is found, and write a grayscale copy with that many columns
+# shaved off (magick -shave) into gray/.
+#
+# usage: ./cut_to_borders.sh <framespath>
+
+set -e
+
+# get frames path from cli args
+framespath=${1:-}
+[ -z "$framespath" ] && echo "no framespath provided" && exit 1
+
+# draw random pic
+mkdir -p gray
+imageName=$(find "$framespath" -type f | sort -R | tail -1)
+# nothing to probe when the directory holds no regular files
+[ -z "$imageName" ] && echo "no images found in $framespath" && exit 1
+shavedName="gray/"$(basename "$imageName")
+
+# the name goes in as an argument so a '%' or '\' in it is not read as format
+printf 'working with %s\r' "$imageName"
+
+# detect image border:
+increment=5    # horizontal step between probed pixels
+x=150          # first probed column
+limitX=600     # last probed column
+y=50           # probed row
+colorFuzz=.01  # a pixel counts as border when its difference is below this
+# borderColor="#f7d9ac"
+borderColor="#ffffff"
+
+while true
+do
+    # txt: output starts with a header line; line 2 holds the pixel, field 3 its colour
+    pixelColor=$(magick convert "$imageName" -crop 1x1+$x+$y -depth 8 txt:- | awk ' NR==2 {print $3}')
+    # compare reports the metric on stderr; sed keeps the value inside the parentheses
+    colorDiff=$(magick compare -metric RMSE xc:"$borderColor" xc:"$pixelColor" null: 2>&1 | sed "s/.*(\(.*\))/\1/")
+    # break condition success
+    echo "fuzz: $colorFuzz; diff: $colorDiff; point: $x:$y"
+    [ 1 -eq "$(echo "$colorFuzz > $colorDiff" | bc)" ] && echo "found border: $x:$y" \
+        && magick convert "$imageName" -colorspace Gray -shave "${x}x0" "$shavedName" \
+        && break
+    # break condition fail: limitX itself has been probed
+    [ "$x" -ge "$limitX" ] && echo "failed to find border" && break
+    # update point
+    x=$((x+increment))
+done
+
+# to functions
+
+# cut left border
+# cut right border
+# assume position
diff --git a/extract_text.sh b/extract_text.sh
new file mode 100755
index 0000000..7b7174c
--- /dev/null
+++ b/extract_text.sh
@@ -0,0 +1,55 @@
+#!/bin/sh
+#
+# OCR every frame image in a directory with tesseract and join the text.
+# Run it from the directory that holds remove_dups.py (called as ./remove_dups.py).
+#
+# usage: ./extract_text.sh <framespath> [lang]
+#   lang is passed to tesseract -l (default: deu)
+
+set -e
+
+framespath=${1:-}
+lang=${2:-deu}
+
+[ -z "$framespath" ] && echo "no framespath provided" && exit 1
+
+# frames/ takes the preprocessed images, text/ the OCR output
+mkdir -p frames text result
+
+# split video on frames (every 90 seconds)
+# ffmpeg -i "$videopath" -r 0.011 frames/ffmpeg_%0d.jpg
+# yes | ffmpeg -i "$videopath" -ss 00:00:05 -vframes 1 frames/ffmpeg_2.jpg
+
+# convert each frame to grayscale then crop to text if any
+for img in "$framespath"/*
+do
+    # an unmatched glob stays literal; skip it and anything that is not a file
+    [ -f "$img" ] || continue
+    iname=$(basename "$img")
+    sub_img_name="frames/sub_${iname}"
+    # drop a trailing image extension (-E: "\|" is not portable in basic regexes)
+    sub_out="text/$(echo "$iname" | sed -E 's/\.(png|gif|jpg|jpeg|bmp)$//')"
+    # $img already includes $framespath
+    magick convert "$img" -chop 220x0 -gravity East -chop 220x0 \
+        -gravity South -chop 0x50 \
+        -colorspace Gray -resize 600x "$sub_img_name"
+    # dpi=$(magick identify -format '%x' $sub_img_name)
+    # echo "$sub_img_name - dpi: $dpi"
+    # dpi=300
+
+    printf "\r%s" "$sub_img_name"
+
+    # call tesseract to get the text
+    # tesseract -l $lang --dpi $dpi $sub_img_name $sub_out
+    tesseract -l "$lang" "$sub_img_name" "$sub_out"
+done
+
+resultfile=$(basename "$framespath")
+# sort by the digits at columns 13-15, i.e. right after "text/ffmpeg_"
+# (assumes frames are named ffmpeg_<n>.<ext>)
+find text -type f | sort -n -k 1.13,1.15 | xargs cat > "result/${resultfile}.txt"
+
+./remove_dups.py text "result/${resultfile}_clean"
+
+# rm -rf text
+# rm -rf frames
diff --git a/extract_text_old.sh b/extract_text_old.sh
new file mode 100755
index 0000000..810423e
--- /dev/null
+++ b/extract_text_old.sh
@@ -0,0 +1,53 @@
+#!/bin/sh
+#
+# Older variant of extract_text.sh: grabs frames from a video with ffmpeg,
+# OCRs them with tesseract and joins the text.
+# Run it from the directory that holds remove_dups.py.
+#
+# usage: ./extract_text_old.sh <video> [lang]
+#   lang is passed to tesseract -l (default: deu)
+
+set -e
+
+filename=${1:-}
+lang=${2:-deu}
+
+[ -z "$filename" ] && echo "no filename provided" && exit 1
+
+mkdir -p frames subs result
+
+# split video on frames (-r 0.01: one frame per 100 seconds)
+ffmpeg -i "$filename" -r 0.01 frames/ffmpeg_%0d.jpg
+
+# also grab the frame at 00:00:05; -y overwrites ffmpeg_2.jpg without prompting
+ffmpeg -y -i "$filename" -ss 00:00:05 -vframes 1 frames/ffmpeg_2.jpg
+
+# convert each frame to grayscale then crop to subs if any
+# (a glob instead of parsing ls; sub_* images from an earlier run are left alone)
+for img in frames/ffmpeg_*.jpg
+do
+    frame=$(basename "$img" .jpg)
+    sub_img_name="frames/sub_$frame.jpg"
+    sub_out="subs/$frame"
+    magick convert "$img" -gravity West -chop 220x0 -gravity East -chop 220x0 \
+        -colorspace Gray "$sub_img_name"
+    # dpi=$(magick identify -format '%x' $sub_img_name)
+    # echo "$sub_img_name - dpi: $dpi"
+    # dpi=300
+
+    echo "$sub_img_name"
+
+    # call tesseract to get the subs
+    # tesseract -l $lang --dpi $dpi $sub_img_name $sub_out
+    tesseract -l "$lang" "$sub_img_name" "$sub_out"
+done
+
+filename=$(basename "$filename")
+# sort by the digits at columns 8-10, i.e. right after "ffmpeg_"; the subshell
+# keeps the cd local, so the redirect target is relative to the start directory
+(cd subs && ls -1 . | sort -n -k 1.8,1.10 | xargs cat) > result/"$filename".txt
+
+./remove_dups.py subs result/"$filename"_out
+
+# rm -rf subs
+# rm -rf frames
diff --git a/param_seeker.sh b/param_seeker.sh
new file mode 100755
index 0000000..eabefa5
--- /dev/null
+++ b/param_seeker.sh
@@ -0,0 +1,92 @@
+#!/bin/sh
+#
+# Try a grid of resize/crop parameters on one random frame, OCR every variant
+# and append a quality score per variant to ./quality_out.
+# Run it from the directory that holds quality_check.py.
+#
+# usage: ./param_seeker.sh <framespath>
+
+set -e
+
+# get frames path from cli args
+framespath=${1:-}
+[ -z "$framespath" ] && echo "no framespath provided" && exit 1
+
+# draw random pic
+mkdir -p test
+imageName=$(find "$framespath" -type f | sort -R | tail -1)
+# nothing to probe when the directory holds no regular files
+[ -z "$imageName" ] && echo "no images found in $framespath" && exit 1
+shavedName="test/"$(basename "$imageName")
+
+echo "working with $imageName"
+
+# detect image border:
+increment=5    # horizontal step between probed pixels
+x=150          # first probed column
+limitX=400     # last probed column
+y=50           # probed row
+colorFuzz=.04  # a pixel counts as border when its difference is below this
+borderColor="#f7d9ac"
+
+while true
+do
+    # txt: output starts with a header line; line 2 holds the pixel, field 3 its colour
+    pixelColor=$(magick convert "$imageName" -crop 1x1+$x+$y -depth 8 txt:- | awk ' NR==2 {print $3}')
+    # compare reports the metric on stderr; sed keeps the value inside the parentheses
+    colorDiff=$(magick compare -metric RMSE xc:"$borderColor" xc:"$pixelColor" null: 2>&1 | sed "s/.*(\(.*\))/\1/")
+    # break condition success
+    echo "fuzz: $colorFuzz; diff: $colorDiff; point: $x:$y"
+    [ 1 -eq "$(echo "$colorFuzz > $colorDiff" | bc)" ] && echo "found border: $x:$y" \
+        && magick convert "$imageName" -colorspace Gray -shave "${x}x0" "$shavedName" \
+        && break
+    # break condition fail: limitX itself has been probed; without a shaved
+    # image the steps below have nothing to work on, so stop here
+    [ "$x" -ge "$limitX" ] && echo "failed to find border" && exit 1
+    # update point
+    x=$((x+increment))
+done
+
+# generate bunch of cut and gray images
+# two param types: scale and bottom crop
+scaleX=400
+scaleInc=20
+bottomCrop=10
+bottomInc=10
+
+# both limits are reached in 20 runs
+loopRunLimit=20
+loop=0
+
+# first change only scale
+# then only bottomCrop
+# then both
+while true
+do
+    # break condition: -ge stops after exactly loopRunLimit runs
+    [ "$loop" -ge "$loopRunLimit" ] && break
+    # shavedName already starts with test/, do not prefix it again
+    resizeName="${shavedName}_${scaleX}_0.png"
+    chopName="${shavedName}_0_${bottomCrop}.png"
+    resizeChopName="${shavedName}_${scaleX}_${bottomCrop}.png"
+    magick convert "$shavedName" -adaptive-resize ${scaleX}x "$resizeName"
+    magick convert "$shavedName" -gravity South -chop 0x${bottomCrop} "$chopName"
+    magick convert "$shavedName" -gravity South -chop 0x${bottomCrop} -adaptive-resize ${scaleX}x "$resizeChopName"
+    scaleX=$((scaleX+scaleInc))
+    bottomCrop=$((bottomCrop+bottomInc))
+    loop=$((loop+1))
+done
+
+# convert every new image to text
+mkdir -p test_text
+for img in test/*
+do
+    outName=$(basename "$img")
+    tesseract -l deu "$img" "test_text/$outName"
+done
+
+# quality check text files
+for t in test_text/*
+do
+    ./quality_check.py "$t" >> quality_out
+done
diff --git a/quality_check.py b/quality_check.py
new file mode 100755
index 0000000..6b55432
--- /dev/null
+++ b/quality_check.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+"""Rate OCR output by the share of tokens a German dictionary accepts.
+
+Prints ``ratio:count:filename`` for the given text file.
+
+use example: ./quality_check.py text/ffmpeg_2.txt
+"""
+
+import sys
+
+import enchant
+
+# punctuation that may stick to a token or stand alone between words
+non_words = "\n,.;!?'"
+ge_dict = enchant.Dict('de_DE')
+
+
+def word_ratio(words):
+    """Return ``(ratio, count)`` for the tokens in ``words``.
+
+    Punctuation from ``non_words`` is stripped off both ends of a token
+    before the dictionary lookup, so "Haus," is checked as "Haus"; tokens
+    made of punctuation only are skipped.  ``count`` is the number of
+    tokens checked, ``ratio`` the share of them the dictionary accepts.
+    Returns ``(0, 0)`` when no token is left to check.
+    """
+    stripped = (w.strip(non_words) for w in words)
+    sane = [ge_dict.check(w) for w in stripped if w]
+    if not sane:
+        return 0, 0
+    return sum(sane) / len(sane), len(sane)
+
+
+def read_into_list(filename):
+    """Return the whitespace-separated tokens of ``filename``."""
+    # read as UTF-8 so umlauts do not depend on the locale encoding
+    with open(filename, "r", encoding="utf-8") as lf:
+        return lf.read().split()
+
+
+def main(argv):
+    """Score the file named by the first cli argument; return an exit code."""
+    if len(argv) < 2:
+        print(f"usage: {argv[0]} <textfile>", file=sys.stderr)
+        return 1
+    textfile = argv[1]
+    ratio, length = word_ratio(read_into_list(textfile))
+    print(f"{ratio}:{length}:{textfile}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
diff --git a/remove_dups.py b/remove_dups.py
new file mode 100755
index 0000000..cc94416
--- /dev/null
+++ b/remove_dups.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+"""Join per-page OCR text files, dropping lines repeated from the previous page.
+
+use example: ./remove_dups.py text result/v12_clean.txt
+
+Lines rejected as gibberish are written to ./skipped_gibberish.
+"""
+
+import glob
+import os
+import re
+import sys
+from difflib import SequenceMatcher as SM
+
+import enchant
+
+ge_dict = enchant.Dict('de_DE')
+gibberish_lines = []
+
+
+def clear_start(lines):
+    """Return ``lines`` without the junk in front of the first text line.
+
+    The first text line itself is dropped as well (and remembered in
+    ``gibberish_lines``) when less than 80% of its words pass the
+    dictionary check.  An empty list is returned as is.
+    """
+    if not lines:
+        # e.g. a page whose only line was already dropped as gibberish
+        return lines
+    text_index = find_first_text_line_index(lines)
+    ratio = gibberish_ratio(lines[text_index])
+    if ratio < .8:  # arbitrary
+        # too much gibberish, skip
+        gibberish_lines.append(lines[text_index])
+        return lines[text_index+1:]
+    # return lines starting with first text line
+    return lines[text_index:]
+
+
+def gibberish_ratio(line):
+    """Return the share of words in ``line`` that pass the dictionary check.
+
+    Despite the name, a higher value means less gibberish.  The line is split
+    on any whitespace so the trailing newline does not stick to the last
+    word; a line without words rates 0.0.
+    """
+    # list of bool: [True, False, True...]
+    sane = [ge_dict.check(w) for w in line.split()]
+    if not sane:
+        return 0.0
+    return sum(sane) / len(sane)
+
+
+def find_first_text_line_index(lines):
+    """Return the index of the first line holding more than one word.
+
+    Falls back to 0 when no such line exists.
+    """
+    for i, line in enumerate(lines):
+        # search for first proper line
+        words = [w for w in line.split(" ") if w not in ("\n", "\r", "")]
+        if len(words) > 1:
+            # found line with text
+            return i
+    # unexpected case
+    return 0
+
+
+def in_prev_lines(prev_lines, current_line):
+    """Tell whether ``current_line`` should be dropped as a repeat.
+
+    True for an empty or form-feed-only line, for a bare newline when the
+    previous page also ended with one, and for any line that is more than
+    90% similar to a non-blank line of ``prev_lines``.
+    """
+    # tesseract ends its output with ^L ("\f"), which never equals ""
+    if current_line.strip("\f") == "":
+        return True  # delete ^L char lines
+    if len(prev_lines) == 0:
+        return False  # no prev lines given
+    # NOTE(review): this looks at the last line of the previous page, not at
+    # the line written just before current_line - confirm that is intended
+    if current_line == "\n" and prev_lines[-1] == "\n":
+        return True  # delete \n if two in a row
+    for prev in prev_lines:
+        if prev == "\n":
+            continue
+        ratio = SM(None, prev, current_line).ratio()
+        if ratio > .9:  # close enough
+            return True
+    return False
+
+
+def get_uniq_lines(prev_lines, curr_lines):
+    """Return the lines of ``curr_lines`` that do not repeat ``prev_lines``."""
+    return [cl for cl in curr_lines if not in_prev_lines(prev_lines, cl)]
+
+
+def sortKeyFunc(s):
+    """Return the page number found in the file name of path ``s``.
+
+    The first run of digits in the base name is used, so ffmpeg_12.txt,
+    ffmpeg_12.jpg.txt and gray_123.txt are all handled; the old fixed slice
+    only fitted one name layout.
+
+    Raises ValueError when the name holds no digits.
+    """
+    name = os.path.basename(s)
+    number = re.search(r"\d+", name)
+    if number is None:
+        raise ValueError(f"no page number in file name: {name!r}")
+    return int(number.group())
+
+
+def main(argv):
+    """Merge the pages in argv[1] into argv[2] (default: result_out)."""
+    if len(argv) < 2:
+        print(f"usage: {argv[0]} <directory> [outname]", file=sys.stderr)
+        return 1
+    directory = argv[1]
+    outname = argv[2] if len(argv) > 2 else "result_out"
+
+    files = sorted(glob.glob(f"{directory}/*"), key=sortKeyFunc)
+
+    prev_page = []
+    page_sep = "-"*60 + "\n"
+
+    with open(outname, "w", encoding="utf-8") as wf:
+        for filename in files:
+            with open(filename, encoding="utf-8") as rf:
+                lines = rf.readlines()
+            # trim junk at the top, then at the bottom via the reversed list
+            lines = clear_start(lines)
+            lines.reverse()
+            lines = clear_start(lines)
+            lines.reverse()
+
+            # compare with prev data and write to file
+            wf.writelines(get_uniq_lines(prev_page, lines))
+            # the separator stays out of the comparison; as part of the page
+            # it matched its predecessor and vanished on every second page
+            wf.write(page_sep)
+
+            # cache the whole page: caching only its new lines let text that
+            # stays on screen for three pages show up again on the third
+            prev_page = lines
+
+    with open("skipped_gibberish", "w", encoding="utf-8") as lf:
+        lf.writelines(gibberish_lines)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
-- 
cgit v1.2.3