A toolbox of scene text detection and recognition

Overview
Comments
  • Could you provide the dataset conversion code for the lmdb format?

    
    # -*- coding: utf-8 -*-
    import argparse
    import glob
    import io
    import os
    import pathlib
    import threading
    
    import cv2 as cv
    import lmdb
    import matplotlib.pyplot as plt
    import numpy as np
    from PIL import Image
    from tqdm import tqdm
    
    # plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese characters correctly
    # plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
    
    root_path = pathlib.Path('/root/autodl-tmp/hwdb')
    
    output_path = os.path.join(root_path, pathlib.Path('lmdb'))
    train_path = os.path.join(root_path, pathlib.Path('train_3755'))
    val_path = os.path.join(root_path, pathlib.Path('test'))
    
    characters = []
    
    with open('../character-3755.txt', 'r', encoding='utf-8') as f:
        for line in f:
            char = line.strip()
            if char:
                characters.append(char)
    
    
    def write_cache(env, cache):
        with env.begin(write=True) as txn:
            for k, v in cache.items():
                if isinstance(v, bytes):
                    # image values are already bytes
                    txn.put(k.encode(), v)
                else:
                    # label values are str; encode them to bytes
                    txn.put(k.encode(), v.encode())
    
    
    def create_dataset(env, image_path, label, index):
        n_samples = len(image_path)
        cache = {}
        cnt = index + 1
        for idx in range(n_samples):
            # read the image file and pair it with its label
            image = image_path[idx]
            if not os.path.exists(image):
                print('%s does not exist' % image)
                continue
            with open(image, 'rb') as fs:
                image_bin = fs.read()
            # the .mdb database stores two kinds of records, image data and
            # label data, each under its own key
            image_key = 'image-%09d' % cnt
            label_key = 'label-%09d' % cnt
            cache[image_key] = image_bin
            cache[label_key] = label
            cnt += 1
        if len(cache) != 0:
            write_cache(env, cache)
        # return the number of samples actually written (missing files are skipped)
        return cnt - index - 1
    
    
    def show_image(samples):
        plt.figure(figsize=(20, 10))
        for pos, sample in enumerate(samples):
            plt.subplot(4, 5, pos + 1)
            plt.imshow(sample[0])
            # plt.title(sample[1])
            plt.xticks([])
            plt.yticks([])
            plt.axis("off")
        plt.show()
    
    
    def lmdb_test(root):
        env = lmdb.open(
            root,
            max_readers=1,
            readonly=True,
            lock=False,
            readahead=False,
            meminit=False)
    
        if not env:
            print('cannot open lmdb from %s' % root)
            return
    
        with env.begin(write=False) as txn:
            n_samples = int(txn.get('num-samples'.encode()))
    
        with env.begin(write=False) as txn:
            samples = []
            for index in range(1, n_samples + 1):
                img_key = 'image-%09d' % index
                img_buf = txn.get(img_key.encode())
                buf = io.BytesIO()
                buf.write(img_buf)
                buf.seek(0)
                try:
                    img = Image.open(buf)
                except IOError:
                    print('Corrupted image for %d' % index)
                    return
                label_key = 'label-%09d' % index
                label = str(txn.get(label_key.encode()).decode('utf-8'))
                print(n_samples, len(img.split()), label)
                samples.append([img, label])
                if index == 5:
                    # show_image(samples)
                    # samples = []
                    break
    
    
    def lmdb_init(directory, out, left, right):
        entries = characters[left:right]
        pbar = tqdm(entries)
        n_samples = 0

        # estimate the map size the database will need
        character_count = len(entries)
        image_path = glob.glob(os.path.join(directory, entries[0], '*.png'))
        image_cnt = len(image_path)
        data_size_per_img = cv.imdecode(np.fromfile(image_path[0], dtype=np.uint8), cv.IMREAD_UNCHANGED).nbytes
        # bytes used by all images of one class
        data_size = data_size_per_img * image_cnt
        # bytes for all classes, doubled as a safety margin
        total_byte = 2 * data_size * character_count
        # create the lmdb environment
        if not os.path.exists(out):
            os.makedirs(out)
        env = lmdb.open(out, map_size=total_byte)
        for dir_name in pbar:
            image_path = glob.glob(os.path.join(directory, dir_name, '*.png'))
            label = dir_name
            n_samples += create_dataset(env, image_path, label, n_samples)
            pbar.set_description(
                f'character[{left + 1}:{right}]: {label} | nSamples: {n_samples} | total_byte: {total_byte}byte | progressing')
    
        write_cache(env, {'num-samples': str(n_samples)})
        env.close()
    
    
    def begin(mode, left, right, valid=False):
        if mode == 'train':
            path = os.path.join(output_path, pathlib.Path(mode + '_' + str(right)))
            if not valid:
                lmdb_init(train_path, path, left=left, right=right)
            else:
                print(f"show:{valid},path:{path}")
                lmdb_test(path)
        elif mode == 'test':
            path = os.path.join(output_path, pathlib.Path(mode + '_' + str(right - left)))
            if not valid:
                lmdb_init(val_path, path, left=left, right=right)
            else:
                print(f"show:{valid},path:{path}")
                lmdb_test(path)
    
    
    class MyThread(threading.Thread):
        def __init__(self, mode, left, right, valid):
            threading.Thread.__init__(self)
            self.mode = mode
            self.left = left
            self.right = right
            self.valid = valid
    
        def run(self):
            begin(mode=self.mode, left=self.left, right=self.right, valid=self.valid)
    
    
    if __name__ == '__main__':
        """
        train_500: 3755类前500个类[1,500] = [0, 500)
        train_1000: 3755类第501到1000类[501,1000] = [500, 1000)
        train_1500: 3755类第1001到1500类[1001,1500] = [1000, 1500)
        train_2000: 3755类第1501到2000类[1501,2000] = [1500, 2000)
        train_2755: 3755类第2001到2755类[2001,2755] = [2000, 2755)
        train_3755: 3755类第2756到3755类[2756,3755] = [2755, 3755)
        test_1000: 3755类后1000类[2756,3755] = [2755, 3755)
        """
        parser = argparse.ArgumentParser()
    
        parser.add_argument("--train", action="store_true", help="generate train lmdb")
        parser.add_argument("--test", action="store_true", help="generate test lmdb")
        parser.add_argument("--all", action="store_true", help="generate all lmdb")
        parser.add_argument("--show", action="store_true", help="show result")
        parser.add_argument("--start", type=int, default=0, help="class start from where,default 0")
        parser.add_argument("--end", type=int, default=3755, help="class end from where,default 3755")
    
        args = parser.parse_args()
    
        train = args.train
        test = args.test
        build_all = args.all
        start = args.start
        end = args.end
        show = args.show
    
        if train:
            print(f"args: mode=train, [start:end)=[{start}:{end})")
            begin(mode='train', left=start, right=end, valid=show)
        if test:
            print(f"args: mode=test, [start:end)=[{start}:{end})")
            begin(mode='test', left=start, right=end, valid=show)
        if build_all:
            s = [0, 500, 1000, 1500, 2000, 2755]
            step = [500, 500, 500, 500, 755, 1000]
            m = ['5*train', '1*test']
            threads = []
            mode_index = 0
            for i in range(len(m)):
                tmp = m[i].strip().split("*")
                for j in range(int(tmp[0])):
                    if show:
                        begin(mode=tmp[1], left=s[mode_index], right=s[mode_index] + step[mode_index], valid=show)
                    else:
                        thread = MyThread(mode=tmp[1], left=s[mode_index],
                                          right=s[mode_index] + step[mode_index], valid=show)
                        threads.append(thread)
                        thread.start()
                    mode_index += 1
    
            for t in threads:
                t.join()
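
    The script above is driven by its own flags (e.g. --train --start 0 --end 500, or --all to build every split in parallel threads). To read the result back, a minimal PyTorch Dataset along these lines should work; this is a sketch, assuming torch and Pillow are installed, and only the key format ('image-%09d' / 'label-%09d', 1-based) is taken from the script above:

    import io

    import lmdb
    from PIL import Image
    from torch.utils.data import Dataset


    class LmdbDataset(Dataset):
        """Minimal reader for lmdb databases produced by the script above."""

        def __init__(self, lmdb_dir, transform=None):
            # lmdb.open expects the directory holding data.mdb, not the .mdb file
            self.env = lmdb.open(lmdb_dir, readonly=True, lock=False,
                                 readahead=False, meminit=False)
            with self.env.begin(write=False) as txn:
                self.n_samples = int(txn.get('num-samples'.encode()))
            self.transform = transform

        def __len__(self):
            return self.n_samples

        def __getitem__(self, index):
            key = index + 1  # keys are 1-based
            with self.env.begin(write=False) as txn:
                image_bin = txn.get(('image-%09d' % key).encode())
                label = txn.get(('label-%09d' % key).encode()).decode('utf-8')
            img = Image.open(io.BytesIO(image_bin)).convert('RGB')
            if self.transform is not None:
                img = self.transform(img)
            return img, label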
    
    
    opened by yeahQing 5
  • 【time】How long does it take to train?

    【moran、aster】How do I set the recognition head to MORAN or ASTER? And do I need to retrain each time the recognition head is changed?

    thank you

    opened by Lz-2019317 5
  • Dataset is not opening

    I was trying to open the .mdb dataset files, but I could not access or read them. I have MS Access, and I tried other options too. Could you help me out?
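
    For context, these .mdb files are LMDB database files, not Microsoft Access databases, so Access cannot read them. A minimal sketch of inspecting one with the Python lmdb package (the directory path is a placeholder; the key names follow the conversion script earlier on this page):

    import lmdb

    # lmdb.open takes the directory that contains data.mdb, not the file itself
    env = lmdb.open('./dataset/train', readonly=True, lock=False)
    with env.begin(write=False) as txn:
        print('samples:', int(txn.get('num-samples'.encode())))
        print('first label:', txn.get('label-000000001'.encode()).decode('utf-8'))
    env.close()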

    opened by kanika02 4
  • Could you provide the segmentation masks needed for TR-PSNR/TR-SSIM?

    Hello. The TR-PSNR/TR-SSIM metrics proposed in your paper require segmentation masks of the text regions, but the downloaded data does not seem to include them. Could you provide the masks used to compute these metrics, or the full training setup of the UNet that produces them (neither the paper nor the supplementary material completely describes the training environment and parameters)?

    opened by Imalne 3
  • Checkpoint experiment directory is cleaned (all models destroyed) upon testing

    Firstly, there isn't full usage guidance for this, so I'm making some assumptions based on the TextZoom repo this was adapted from; please advise if I'm better off using it differently!

    To train the model I ran:

    python main.py --batch_size=32 --STN --mask --exp_name louis --text_focus
    

    which successfully put checkpoint.pth and model_best.pth in checkpoints/louis/. However, when I went to test these, it printed

    Clean the old checkpoint louis
    

    and then complained:

    FileNotFoundError: [Errno 2] No such file or directory: './checkpoint/louis/checkpoint.pth'
    

    ...so the saved model state was destroyed by trying to test it. Am I calling it wrong somehow? I suspect this cleanup should only run when training (not testing) a model with the same experiment name as a pre-existing one; see the sketch below.
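
    A guard along these lines would avoid it; this is a hypothetical sketch (the repo's actual cleanup code isn't reproduced here, so the function name and arguments are assumed):

    import os
    import shutil


    def prepare_exp_dir(exp_dir, training):
        # Only wipe an existing experiment directory when starting a fresh
        # training run; testing or resuming must leave checkpoints intact.
        if training and os.path.isdir(exp_dir):
            print('Clean the old checkpoint %s' % exp_dir)
            shutil.rmtree(exp_dir)
        os.makedirs(exp_dir, exist_ok=True)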

    documentation 
    opened by lmmx 3
  • super_resolution.py

    Do we have to change these paths (folder_GT and folder_GN) in super_resolution.py?

    Also, can we find the PSNR and SSIM of all images and store them in a CSV? How to do that?
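
    One way to do the latter, sketched under the assumption that folder_GT and folder_GN hold same-sized image pairs with matching filenames and that scikit-image is installed:

    import csv
    import glob
    import os

    import cv2
    from skimage.metrics import peak_signal_noise_ratio, structural_similarity

    folder_GT = './GT'  # ground-truth images (placeholder path)
    folder_GN = './GN'  # generated images with matching filenames (placeholder path)

    with open('metrics.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['filename', 'psnr', 'ssim'])
        for gt_path in sorted(glob.glob(os.path.join(folder_GT, '*.png'))):
            name = os.path.basename(gt_path)
            gt = cv2.imread(gt_path)
            gn = cv2.imread(os.path.join(folder_GN, name))
            psnr = peak_signal_noise_ratio(gt, gn)
            # channel_axis needs scikit-image >= 0.19; older versions use multichannel=True
            ssim = structural_similarity(gt, gn, channel_axis=-1)
            writer.writerow([name, psnr, ssim])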

    opened by kanika02 2
  • 【block】class TBSRN: in the tbsrn.py file, what does it mean? Thank you very much!

    def forward(self, x):
        ...
        for i in range(self.srb_nums + 1):
            block[str(i + 2)] = getattr(self, 'block%d' % (i + 2))(block[str(i + 1)])

        block[str(self.srb_nums + 3)] = getattr(self, 'block%d' % (self.srb_nums + 3)) \
            ((block['1'] + block[str(self.srb_nums + 2)]))
        output = torch.tanh(block[str(self.srb_nums + 3)])
    

    First question: block means a basic unit of TBSRN, right? My guess: in the TBSRN class, block1 is conv1, blocks 2-7 are the TBSRN units where n = srb_nums, but the initial param srb_nums equals 5. Does block8 mean the subsampling block?

    Second question: in def forward, how is the upsampling block implemented? I only see the code: block[str(self.srb_nums + 3)] = getattr(self, 'block%d' % (self.srb_nums + 3))((block['1'] + block[str(self.srb_nums + 2)])) Thank you~~~~~~
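
    For anyone puzzling over the same code: block is just a dict of intermediate feature maps keyed by block index, and with srb_nums=5 the loop applies block2..block7 in sequence, while block8 (applied to block1's output plus block7's, a global skip connection) performs the upsampling before the final tanh. A toy sketch of the same wiring (layer contents deliberately simplified; only the dict/getattr pattern mirrors tbsrn.py):

    import torch
    import torch.nn as nn


    class TinyTSRN(nn.Module):
        """Toy version of the block-dict pattern in tbsrn.py."""

        def __init__(self, srb_nums=5, channels=64):
            super().__init__()
            self.srb_nums = srb_nums
            self.block1 = nn.Conv2d(3, channels, 3, padding=1)  # input conv
            for i in range(srb_nums + 1):
                # block2..block7: the residual units plus one trailing conv
                setattr(self, 'block%d' % (i + 2),
                        nn.Conv2d(channels, channels, 3, padding=1))
            # block8: upsampling head (pixel shuffle doubles the spatial size)
            setattr(self, 'block%d' % (srb_nums + 3), nn.Sequential(
                nn.Conv2d(channels, 4 * 3, 3, padding=1), nn.PixelShuffle(2)))

        def forward(self, x):
            block = {'1': self.block1(x)}
            for i in range(self.srb_nums + 1):
                block[str(i + 2)] = getattr(self, 'block%d' % (i + 2))(block[str(i + 1)])
            # global skip: block1's output is added back before the upsampling head
            block[str(self.srb_nums + 3)] = getattr(self, 'block%d' % (self.srb_nums + 3))(
                block['1'] + block[str(self.srb_nums + 2)])
            return torch.tanh(block[str(self.srb_nums + 3)])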

    opened by Lz-2019317 1
  • About weight_cross_entropy

    This paper is quite impressive research.

    I have a question about implementation.

    In this paper, weight cross-entropy (Content-Aware Module) is employed for calculating loss between the prediction of SR image and gt. However, the code seems to calculate loss between the prediction of HR image and gt. https://github.com/FudanVI/FudanOCR/blob/342eb1c79ed23abe2ed944a28d7361adacf693af/scene-text-telescope/loss/text_focus_loss.py#L97

    bug 
    opened by terryoo 1
  • Demo will not run due to tensor size mismatch

    The demo won't run for me due to a tensor size mismatch. I'm calling it as:

    python main.py --demo --demo_dir='./demo/' --resume='./checkpoint/louis/model_best.pth' --STN --mask --exp_name louis
    

    The mismatch is arising on line 367 of interfaces/super_resolution.py at:

                images_sr = model(images_lr)
    

    The traceback reports "got 1024 and 8192 (The offending index is 0)":

    Namespace(STN=True, arch='tbsrn', batch_size=None, demo=True, demo_dir='./demo/', exp_name='louis', hd_u=32, mask=True, mixed=False, rec='crnn', resume='./checkpoint/louis/model_best.pth', srb=5, syn=False, test=False, test_data_dir='./dataset/mydata/test/easy', text_focus=False)
    loading pre-trained model from ./checkpoint/louis/model_best.pth 
    Total Parameters 3220992
    loading pretrained crnn model from ./dataset/mydata/crnn.pth
      0%|                                                                                                                                                                                                                                                                                                                            | 0/3 [00:00<?, ?it/s]
    Traceback (most recent call last):
      File "main.py", line 40, in <module>
        main(config, args)
      File "main.py", line 13, in main
        Mission.demo()
      File "/home/louis/dev/sr/FudanOCR/scene-text-telescope/interfaces/super_resolution.py", line 367, in demo
        images_sr = model(images_lr)
      File "/home/louis/miniconda3/envs/sttsr/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
        return forward_call(*input, **kwargs)
      File "/home/louis/miniconda3/envs/sttsr/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 166, in forward
        return self.module(*inputs[0], **kwargs[0])
      File "/home/louis/miniconda3/envs/sttsr/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
        return forward_call(*input, **kwargs)
      File "/home/louis/dev/sr/FudanOCR/scene-text-telescope/model/tbsrn.py", line 221, in forward
        block[str(i + 2)] = getattr(self, 'block%d' % (i + 2))(block[str(i + 1)])
      File "/home/louis/miniconda3/envs/sttsr/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
        return forward_call(*input, **kwargs)
      File "/home/louis/dev/sr/FudanOCR/scene-text-telescope/model/tbsrn.py", line 255, in forward
        residual = self.feature_enhancer(residual)
      File "/home/louis/miniconda3/envs/sttsr/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
        return forward_call(*input, **kwargs)
      File "/home/louis/dev/sr/FudanOCR/scene-text-telescope/model/tbsrn.py", line 85, in forward
        conv_feature = torch.cat([conv_feature, position2d],1) # batch, 128(64+64), 32, 128
    RuntimeError: Sizes of tensors must match except in dimension 2. Got 1024 and 8192 (The offending index is 0)
    

    This pretrained model is the one provided in Dropbox (have you switched to a different model since uploading perhaps?)

    bug 
    opened by lmmx 1
  • lmdb.Error: ./data/mydata/train_data.mdb: Not a directory

    Hello, my experiment keeps hitting this error: lmdb.Error: ./data/mydata/train_data.mdb: Not a directory. Have you ever come across this situation? How can I solve it? I would appreciate it if you could help me.
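
    This error typically means the path passed to lmdb.open points at the .mdb file itself. An LMDB environment is a directory containing data.mdb and lock.mdb, so either point at the enclosing directory or open the standalone file with subdir=False; a minimal illustration using the path from the error:

    import lmdb

    # Option 1: rename the file to data.mdb and open its parent directory
    env = lmdb.open('./data/mydata', readonly=True, lock=False)

    # Option 2: open a standalone .mdb file directly
    env = lmdb.open('./data/mydata/train_data.mdb', subdir=False,
                    readonly=True, lock=False)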

    opened by cptbtptp125 0
  • How do I convert HWDB datasets to LMDB format?

    Hello, I have tried many methods but failed to convert the HWDB dataset. Could you please explain how to convert HWDB to LMDB?

    I will appreciate it very much.

    opened by cptbtptp125 0
  • How is the .pkl file in print_font_template generated in Stroke-level-decomposition?

    Is there a corresponding support sample for stroke-level recognition in Korean? I would appreciate it if you could help me.

    opened by cptbtptp125 0
  • Why can't I reproduce the results in stroke-level-decomposition?

    Hi, everyone! First of all, I'm very excited that you have shared your work. I ran your code with the default config; only the datasets were generated by myself. Unfortunately, I am getting bad results on train_500 and test_1000, shown below. I don't know what could cause this; the accuracy is only around 0.5% after 50 epochs.

    Note: the epoch number shown is a display error.

    Epoch : 0 | ACC : 0.00011712148844680174
    Epoch : 0 | ACC : 0.003697692706677598
    Epoch : 0 | ACC : 0.003998862248397945
    Epoch : 0 | ACC : 0.0030284270584101593
    Epoch : 0 | ACC : 0.002325698127729349
    Epoch : 0 | ACC : 0.005103150568039219
    Epoch : 0 | ACC : 0.004818712667525558
    Epoch : 0 | ACC : 0.004701591179078756
    Epoch : 0 | ACC : 0.0047685177439055
    Epoch : 0 | ACC : 0.005036224003212475
    Epoch : 0 | ACC : 0.0052202720564860205
    Epoch : 0 | ACC : 0.0052202720564860205
    Epoch : 0 | ACC : 0.005103150568039219
    Epoch : 0 | ACC : 0.005203540415279335
    Epoch : 0 | ACC : 0.0052202720564860205
    Epoch : 0 | ACC : 0.005136613850452591
    Epoch : 0 | ACC : 0.005337393544932822
    Epoch : 0 | ACC : 0.005052955644419161
    Epoch : 0 | ACC : 0.005153345491659277
    Epoch : 0 | ACC : 0.005203540415279335
    Epoch : 0 | ACC : 0.004952565797179045
    Epoch : 0 | ACC : 0.0052202720564860205
    Epoch : 0 | ACC : 0.005186808774072649
    Epoch : 0 | ACC : 0.004952565797179045
    Epoch : 0 | ACC : 0.00538758846855288
    Epoch : 0 | ACC : 0.005504709956999682
    Epoch : 0 | ACC : 0.0051700771328659625
    Epoch : 0 | ACC : 0.005521441598206368
    Epoch : 0 | ACC : 0.005203540415279335
    Epoch : 0 | ACC : 0.005504709956999682
    Epoch : 0 | ACC : 0.005186808774072649
    Epoch : 0 | ACC : 0.005186808774072649
    Epoch : 0 | ACC : 0.005421051750966252
    Epoch : 0 | ACC : 0.005571636521826426
    Epoch : 0 | ACC : 0.005036224003212475
    Epoch : 0 | ACC : 0.00538758846855288
    Epoch : 0 | ACC : 0.005186808774072649
    Epoch : 0 | ACC : 0.005237003697692707
    Epoch : 0 | ACC : 0.004952565797179045
    Epoch : 0 | ACC : 0.005153345491659277
    Epoch : 0 | ACC : 0.0051700771328659625
    Epoch : 0 | ACC : 0.005153345491659277
    Epoch : 0 | ACC : 0.005086418926832533
    Epoch : 0 | ACC : 0.004986029079592417
    
    opened by yeahQing 5
  • How is the pretrained transformer model trained?

    Hello! With my admittedly shallow understanding so far: during training, the transformer directly loads the stroke weights pretrain_transformer_stroke_decomposition.pth, computes results for the SR and HR images, and the L1 loss is backpropagated to the generator; at inference, the LR image passes through the generator to obtain the SR image, and CRNN predicts the result directly. Is that right? Do the transformer's parameters stay fixed throughout, and how was the stroke part trained?

    opened by dzyanshan 0
  • How can I evaluate TBSRN with ASTER and MORAN?

    I prepared pretrained models for ASTER and MORAN, but there are no Aster/Moran files in this repository. I copied the corresponding files from TextZoom to the right paths but ran into runtime errors.

    opened by MinghaoFu 0
Owner
FudanVIC Team
Visual Intelligence & Cognition Team at Fudan University
Motion detector, Full body detection, Upper body detection, Cat face detection, Smile detection, Face detection (haar cascade), Silverware detection, Face detection (lbp), and Sending email notifications

Security camera running OpenCV for object and motion detection. The camera will send email with image of any objects it detects. It also runs a server that provides web interface with live stream video.

Peace 10 Jun 30, 2021
OpenMMLab Text Detection, Recognition and Understanding Toolbox

Introduction English | 简体中文 MMOCR is an open-source toolbox based on PyTorch and mmdetection for text detection, text recognition, and the correspondi

OpenMMLab 3k Jan 7, 2023
A curated list of papers and resources for scene text detection and recognition

Awesome Scene Text A curated list of papers and resources for scene text detection and recognition The year when a paper was first published, includin

Jan Zdenek 43 Mar 15, 2022
End-to-end pipeline for real-time scene text detection and recognition.

Real-time-Scene-Text-Detection-and-Recognition-System End-to-end pipeline for real-time scene text detection and recognition. The detection model use

Fangneng Zhan 89 Aug 4, 2022
Scene text detection and recognition based on Extremal Region(ER)

Scene text recognition A real-time scene text recognition algorithm. Our system is able to recognize text in unconstrain background. This algorithm is

HSIEH, YI CHIA 155 Dec 6, 2022
Tracking the latest progress in Scene Text Detection and Recognition: Must-read papers well organized

SceneTextPapers Tracking the latest progress in Scene Text Detection and Recognition: must-read papers well organized Information about this repositor

Shangbang Long 763 Jan 1, 2023
OCR, Scene-Text-Understanding, Text Recognition

Scene-Text-Understanding Survey [2015-PAMI] Text Detection and Recognition in Imagery: A Survey paper [2014-Front.Comput.Sci] Scene Text Detection and

Alan Tang 354 Dec 12, 2022
This project modify tensorflow object detection api code to predict oriented bounding boxes. It can be used for scene text detection.

This is an oriented object detector based on tensorflow object detection API. Most of the code is not changed except for those related to the need of

Dafang He 30 Oct 22, 2022
A novel region proposal network for more general object detection ( including scene text detection ).

DeRPN: Taking a further step toward more general object detection DeRPN is a novel region proposal network which concentrates on improving the adaptiv

Deep Learning and Vision Computing Lab, SCUT 151 Dec 12, 2022
An Implementation of the algorithm in paper IncepText: A New Inception-Text Module with Deformable PSROI Pooling for Multi-Oriented Scene Text Detection

InceptText-Tensorflow An Implementation of the algorithm in paper IncepText: A New Inception-Text Module with Deformable PSROI Pooling for Multi-Orien

GeorgeJoe 115 Dec 12, 2022
A curated list of resources for text detection/recognition (optical character recognition ) with deep learning methods.

awesome-deep-text-detection-recognition A curated list of awesome deep learning based papers on text detection and recognition. Text Detection Papers

null 2.4k Jan 8, 2023
A curated list of resources dedicated to scene text localization and recognition

Scene Text Localization & Recognition Resources A curated list of resources dedicated to scene text localization and recognition. Any suggestions and

CarlosTao 1.6k Dec 22, 2022
MORAN: A Multi-Object Rectified Attention Network for Scene Text Recognition

MORAN: A Multi-Object Rectified Attention Network for Scene Text Recognition Python 2.7 Python 3.6 MORAN is a network with rectification mechanism for

Canjie Luo 595 Dec 27, 2022
Scene text recognition

AttentionOCR for Arbitrary-Shaped Scene Text Recognition Introduction This is the ranked No.1 tensorflow based scene text spotting algorithm on ICDAR2

null 777 Jan 9, 2023
Code for the AAAI 2018 publication "SEE: Towards Semi-Supervised End-to-End Scene Text Recognition"

SEE: Towards Semi-Supervised End-to-End Scene Text Recognition Code for the AAAI 2018 publication "SEE: Towards Semi-Supervised End-to-End Scene Text

Christian Bartz 572 Jan 5, 2023
Convolutional Recurrent Neural Networks(CRNN) for Scene Text Recognition

CRNN_Tensorflow This is a TensorFlow implementation of a Deep Neural Network for scene text recognition. It is mainly based on the paper "An End-to-En

MaybeShewill-CV 1000 Dec 27, 2022
Code for the paper STN-OCR: A single Neural Network for Text Detection and Text Recognition

STN-OCR: A single Neural Network for Text Detection and Text Recognition This repository contains the code for the paper: STN-OCR: A single Neural Net

Christian Bartz 496 Jan 5, 2023
Multi-Oriented Scene Text Detection via Corner Localization and Region Segmentation

This is the official implementation of "Multi-Oriented Scene Text Detection via Corner Localization and Region Segmentation". For more details, please

Pengyuan Lyu 309 Dec 6, 2022