A toolbox of scene text detection and recognition

Overview
Comments
  • Could you provide the dataset conversion code for the lmdb format?

    
    # -*- coding: utf-8 -*-
    import argparse
    import glob
    import io
    import os
    import pathlib
    import threading
    
    import cv2 as cv
    import lmdb
    import matplotlib.pyplot as plt
    import numpy as np
    from PIL import Image
    from tqdm import tqdm
    
    # plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese characters correctly
    # plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
    
    root_path = pathlib.Path('/root/autodl-tmp/hwdb')
    
    output_path = os.path.join(root_path, pathlib.Path('lmdb'))
    train_path = os.path.join(root_path, pathlib.Path('train_3755'))
    val_path = os.path.join(root_path, pathlib.Path('test'))
    
    characters = []
    
    with open('../character-3755.txt', 'r', encoding='utf-8') as f:
        for line in f:
            char = line.strip()
            if char:
                characters.append(char)
    
    
    def write_cache(env, cache):
        with env.begin(write=True) as txn:
            for k, v in cache.items():
                if isinstance(v, bytes):
                    # image values are already bytes
                    txn.put(k.encode(), v)
                else:
                    # label values are str; encode them to bytes
                    txn.put(k.encode(), v.encode())
    
    
    def create_dataset(env, image_path, label, index):
        n_samples = len(image_path)
        cache = {}
        cnt = index + 1
        for idx in range(n_samples):
            # read the image file and pair it with its label
            image = image_path[idx]
            if not os.path.exists(image):
                print('%s does not exist' % image)
                continue
            with open(image, 'rb') as fs:
                image_bin = fs.read()
            # the .mdb database stores two kinds of records, image data and
            # label data, each under its own key
            image_key = 'image-%09d' % cnt
            label_key = 'label-%09d' % cnt
            cache[image_key] = image_bin
            cache[label_key] = label
            cnt += 1
        if len(cache) != 0:
            write_cache(env, cache)
        # return the number of samples actually written (missing files are skipped)
        return cnt - index - 1
    
    
    def show_image(samples):
        plt.figure(figsize=(20, 10))
        for pos, sample in enumerate(samples):
            plt.subplot(4, 5, pos + 1)
            plt.imshow(sample[0])
            # plt.title(sample[1])
            plt.xticks([])
            plt.yticks([])
            plt.axis("off")
        plt.show()
    
    
    def lmdb_test(root):
        env = lmdb.open(
            root,
            max_readers=1,
            readonly=True,
            lock=False,
            readahead=False,
            meminit=False)
    
        if not env:
            print('cannot open lmdb from %s' % root)
            return
    
        with env.begin(write=False) as txn:
            n_samples = int(txn.get('num-samples'.encode()))
    
        with env.begin(write=False) as txn:
            samples = []
            for index in range(1, n_samples + 1):
                img_key = 'image-%09d' % index
                img_buf = txn.get(img_key.encode())
                buf = io.BytesIO()
                buf.write(img_buf)
                buf.seek(0)
                try:
                    img = Image.open(buf)
                except IOError:
                    print('Corrupted image for %d' % index)
                    return
                label_key = 'label-%09d' % index
                label = str(txn.get(label_key.encode()).decode('utf-8'))
                print(n_samples, len(img.split()), label)
                samples.append([img, label])
                if index == 5:
                    # show_image(samples)
                    # samples = []
                    break
    
    
    def lmdb_init(directory, out, left, right):
        entries = characters[left:right]
        pbar = tqdm(entries)
        n_samples = 0

        # estimate the map size the database will need
        character_count = len(entries)
        image_path = glob.glob(os.path.join(directory, entries[0], '*.png'))
        image_cnt = len(image_path)
        data_size_per_img = cv.imdecode(np.fromfile(image_path[0], dtype=np.uint8), cv.IMREAD_UNCHANGED).nbytes
        # bytes used by all images of one class
        data_size = data_size_per_img * image_cnt
        # bytes for all classes, doubled as a safety margin
        total_byte = 2 * data_size * character_count
        # create the lmdb environment
        if not os.path.exists(out):
            os.makedirs(out)
        env = lmdb.open(out, map_size=total_byte)
        for dir_name in pbar:
            image_path = glob.glob(os.path.join(directory, dir_name, '*.png'))
            label = dir_name
            n_samples += create_dataset(env, image_path, label, n_samples)
            pbar.set_description(
                f'character[{left + 1}:{right}]: {label} | nSamples: {n_samples} | total_byte: {total_byte}byte | progressing')
    
        write_cache(env, {'num-samples': str(n_samples)})
        env.close()
    
    
    def begin(mode, left, right, valid=False):
        if mode == 'train':
            path = os.path.join(output_path, pathlib.Path(mode + '_' + str(right)))
            if not valid:
                lmdb_init(train_path, path, left=left, right=right)
            else:
                print(f"show:{valid},path:{path}")
                lmdb_test(path)
        elif mode == 'test':
            path = os.path.join(output_path, pathlib.Path(mode + '_' + str(right - left)))
            if not valid:
                lmdb_init(val_path, path, left=left, right=right)
            else:
                print(f"show:{valid},path:{path}")
                lmdb_test(path)
    
    
    class MyThread(threading.Thread):
        def __init__(self, mode, left, right, valid):
            threading.Thread.__init__(self)
            self.mode = mode
            self.left = left
            self.right = right
            self.valid = valid
    
        def run(self):
            begin(mode=self.mode, left=self.left, right=self.right, valid=self.valid)
    
    
    if __name__ == '__main__':
        """
        train_500: 3755类前500个类[1,500] = [0, 500)
        train_1000: 3755类第501到1000类[501,1000] = [500, 1000)
        train_1500: 3755类第1001到1500类[1001,1500] = [1000, 1500)
        train_2000: 3755类第1501到2000类[1501,2000] = [1500, 2000)
        train_2755: 3755类第2001到2755类[2001,2755] = [2000, 2755)
        train_3755: 3755类第2756到3755类[2756,3755] = [2755, 3755)
        test_1000: 3755类后1000类[2756,3755] = [2755, 3755)
        """
        parser = argparse.ArgumentParser()
    
        parser.add_argument("--train", action="store_true", help="generate train lmdb")
        parser.add_argument("--test", action="store_true", help="generate test lmdb")
        parser.add_argument("--all", action="store_true", help="generate all lmdb")
        parser.add_argument("--show", action="store_true", help="show result")
        parser.add_argument("--start", type=int, default=0, help="class start from where,default 0")
        parser.add_argument("--end", type=int, default=3755, help="class end from where,default 3755")
    
        args = parser.parse_args()
    
        train = args.train
        test = args.test
        build_all = args.all
        start = args.start
        end = args.end
        show = args.show
    
        if train:
            print(f"args: mode=train, [start:end)=[{start}:{end})")
            begin(mode='train', left=start, right=end, valid=show)
        if test:
            print(f"args: mode=test, [start:end)=[{start}:{end})")
            begin(mode='test', left=start, right=end, valid=show)
        if build_all:
            s = [0, 500, 1000, 1500, 2000, 2755]
            step = [500, 500, 500, 500, 755, 1000]
            m = ['5*train', '1*test']
            threads = []
            mode_index = 0
            for i in range(len(m)):
                tmp = m[i].strip().split("*")
                for j in range(int(tmp[0])):
                    if show:
                        begin(mode=tmp[1], left=s[mode_index], right=s[mode_index] + step[mode_index], valid=show)
                    else:
                        thread = MyThread(mode=tmp[1], left=s[mode_index],
                                          right=s[mode_index] + step[mode_index], valid=show)
                        threads.append(thread)
                        thread.start()
                    mode_index += 1
    
            for t in threads:
                t.join()
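
    The script above is driven by its own flags (e.g. --train --start 0 --end 500, or --all to build every split in parallel threads). To read the result back, a minimal PyTorch Dataset along these lines should work; this is a sketch, assuming torch and Pillow are installed, and only the key format ('image-%09d' / 'label-%09d', 1-based) is taken from the script above:

    import io

    import lmdb
    from PIL import Image
    from torch.utils.data import Dataset


    class LmdbDataset(Dataset):
        """Minimal reader for lmdb databases produced by the script above."""

        def __init__(self, lmdb_dir, transform=None):
            # lmdb.open expects the directory holding data.mdb, not the .mdb file
            self.env = lmdb.open(lmdb_dir, readonly=True, lock=False,
                                 readahead=False, meminit=False)
            with self.env.begin(write=False) as txn:
                self.n_samples = int(txn.get('num-samples'.encode()))
            self.transform = transform

        def __len__(self):
            return self.n_samples

        def __getitem__(self, index):
            key = index + 1  # keys are 1-based
            with self.env.begin(write=False) as txn:
                image_bin = txn.get(('image-%09d' % key).encode())
                label = txn.get(('label-%09d' % key).encode()).decode('utf-8')
            img = Image.open(io.BytesIO(image_bin)).convert('RGB')
            if self.transform is not None:
                img = self.transform(img)
            return img, label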
    
    
    opened by yeahQing 5
  • 【time】How long does it take to train?

    【moran、aster】How do I set the recognition head to MORAN or ASTER? And do I need to retrain each time the recognition head is changed?

    thank you

    opened by Lz-2019317 5
  • Dataset is not opening

    I was trying to open the .mdb dataset files, but I could not access or read them. I have MS Access, and I tried other options too. Could you help me out?
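
    For context, these .mdb files are LMDB database files, not Microsoft Access databases, so Access cannot read them. A minimal sketch of inspecting one with the Python lmdb package (the directory path is a placeholder; the key names follow the conversion script earlier on this page):

    import lmdb

    # lmdb.open takes the directory that contains data.mdb, not the file itself
    env = lmdb.open('./dataset/train', readonly=True, lock=False)
    with env.begin(write=False) as txn:
        print('samples:', int(txn.get('num-samples'.encode())))
        print('first label:', txn.get('label-000000001'.encode()).decode('utf-8'))
    env.close()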

    opened by kanika02 4
  • Could you provide the segmentation masks needed for TR-PSNR/TR-SSIM?

    Hello. The TR-PSNR/TR-SSIM metrics proposed in your paper require segmentation masks of the text regions, but the downloaded data does not seem to include them. Could you provide the masks used to compute these metrics, or the full training setup of the UNet that produces them (neither the paper nor the supplementary material completely describes the training environment and parameters)?

    opened by Imalne 3
  • Checkpoint experiment directory is cleaned (all models destroyed) upon testing

    Firstly, there isn't full usage guidance for this, so I'm making some assumptions based on the TextZoom repo this was adapted from; please advise if I'm better off using it differently!

    To train the model I ran:

    python main.py --batch_size=32 --STN --mask --exp_name louis --text_focus
    

    which successfully put checkpoint.pth and model_best.pth in checkpoints/louis/. However, when I went to test these, it printed

    Clean the old checkpoint louis
    

    and then complained:

    FileNotFoundError: [Errno 2] No such file or directory: './checkpoint/louis/checkpoint.pth'
    

    ...so the saved model state was destroyed by trying to test it. Am I calling it wrong somehow? I suspect this cleanup should only run when training (not testing) a model with the same experiment name as a pre-existing one; see the sketch below.
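
    A guard along these lines would avoid it; this is a hypothetical sketch (the repo's actual cleanup code isn't reproduced here, so the function name and arguments are assumed):

    import os
    import shutil


    def prepare_exp_dir(exp_dir, training):
        # Only wipe an existing experiment directory when starting a fresh
        # training run; testing or resuming must leave checkpoints intact.
        if training and os.path.isdir(exp_dir):
            print('Clean the old checkpoint %s' % exp_dir)
            shutil.rmtree(exp_dir)
        os.makedirs(exp_dir, exist_ok=True)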

    documentation 
    opened by lmmx 3
  • super_resolution.py

    Do we have to change these paths (folder_GT and folder_GN) in super_resolution.py?

    Also, can we find the PSNR and SSIM of all images and store them in a CSV? How to do that?
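
    One way to do the latter, sketched under the assumption that folder_GT and folder_GN hold same-sized image pairs with matching filenames and that scikit-image is installed:

    import csv
    import glob
    import os

    import cv2
    from skimage.metrics import peak_signal_noise_ratio, structural_similarity

    folder_GT = './GT'  # ground-truth images (placeholder path)
    folder_GN = './GN'  # generated images with matching filenames (placeholder path)

    with open('metrics.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['filename', 'psnr', 'ssim'])
        for gt_path in sorted(glob.glob(os.path.join(folder_GT, '*.png'))):
            name = os.path.basename(gt_path)
            gt = cv2.imread(gt_path)
            gn = cv2.imread(os.path.join(folder_GN, name))
            psnr = peak_signal_noise_ratio(gt, gn)
            # channel_axis needs scikit-image >= 0.19; older versions use multichannel=True
            ssim = structural_similarity(gt, gn, channel_axis=-1)
            writer.writerow([name, psnr, ssim])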

    opened by kanika02 2
  • 【block】class TBSRN: in the tbsrn.py file, what does it mean? Thank you very much!

    def forward(self, x):
        ...
        for i in range(self.srb_nums + 1):
            block[str(i + 2)] = getattr(self, 'block%d' % (i + 2))(block[str(i + 1)])

        block[str(self.srb_nums + 3)] = getattr(self, 'block%d' % (self.srb_nums + 3)) \
            ((block['1'] + block[str(self.srb_nums + 2)]))
        output = torch.tanh(block[str(self.srb_nums + 3)])
    

    First question: block means a basic unit of TBSRN, right? My guess: in the TBSRN class, block1 is conv1, blocks 2-7 are the TBSRN units where n = srb_nums, but the initial param srb_nums equals 5. Does block8 mean the subsampling block?

    Second question: in def forward, how is the upsampling block implemented? I only see the code: block[str(self.srb_nums + 3)] = getattr(self, 'block%d' % (self.srb_nums + 3))((block['1'] + block[str(self.srb_nums + 2)])) Thank you~~~~~~
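
    For anyone puzzling over the same code: block is just a dict of intermediate feature maps keyed by block index, and with srb_nums=5 the loop applies block2..block7 in sequence, while block8 (applied to block1's output plus block7's, a global skip connection) performs the upsampling before the final tanh. A toy sketch of the same wiring (layer contents deliberately simplified; only the dict/getattr pattern mirrors tbsrn.py):

    import torch
    import torch.nn as nn


    class TinyTSRN(nn.Module):
        """Toy version of the block-dict pattern in tbsrn.py."""

        def __init__(self, srb_nums=5, channels=64):
            super().__init__()
            self.srb_nums = srb_nums
            self.block1 = nn.Conv2d(3, channels, 3, padding=1)  # input conv
            for i in range(srb_nums + 1):
                # block2..block7: the residual units plus one trailing conv
                setattr(self, 'block%d' % (i + 2),
                        nn.Conv2d(channels, channels, 3, padding=1))
            # block8: upsampling head (pixel shuffle doubles the spatial size)
            setattr(self, 'block%d' % (srb_nums + 3), nn.Sequential(
                nn.Conv2d(channels, 4 * 3, 3, padding=1), nn.PixelShuffle(2)))

        def forward(self, x):
            block = {'1': self.block1(x)}
            for i in range(self.srb_nums + 1):
                block[str(i + 2)] = getattr(self, 'block%d' % (i + 2))(block[str(i + 1)])
            # global skip: block1's output is added back before the upsampling head
            block[str(self.srb_nums + 3)] = getattr(self, 'block%d' % (self.srb_nums + 3))(
                block['1'] + block[str(self.srb_nums + 2)])
            return torch.tanh(block[str(self.srb_nums + 3)])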

    opened by Lz-2019317 1
  • About weight_cross_entropy

    This paper is quite impressive research.

    I have a question about implementation.

    In this paper, weight cross-entropy (Content-Aware Module) is employed for calculating loss between the prediction of SR image and gt. However, the code seems to calculate loss between the prediction of HR image and gt. https://github.com/FudanVI/FudanOCR/blob/342eb1c79ed23abe2ed944a28d7361adacf693af/scene-text-telescope/loss/text_focus_loss.py#L97

    bug 
    opened by terryoo 1
  • Demo will not run due to tensor size mismatch

    The demo won't run for me due to a tensor size mismatch. I'm calling it as:

    python main.py --demo --demo_dir='./demo/' --resume='./checkpoint/louis/model_best.pth' --STN --mask --exp_name louis
    

    The mismatch is arising on line 367 of interfaces/super_resolution.py at:

                images_sr = model(images_lr)
    

    The traceback reports "got 1024 and 8192 (The offending index is 0)":

    Namespace(STN=True, arch='tbsrn', batch_size=None, demo=True, demo_dir='./demo/', exp_name='louis', hd_u=32, mask=True, mixed=False, rec='crnn', resume='./checkpoint/louis/model_best.pth', srb=5, syn=False, test=False, test_data_dir='./dataset/mydata/test/easy', text_focus=False)
    loading pre-trained model from ./checkpoint/louis/model_best.pth 
    Total Parameters 3220992
    loading pretrained crnn model from ./dataset/mydata/crnn.pth
      0%|                                                                                                                                                                                                                                                                                                                            | 0/3 [00:00<?, ?it/s]
    Traceback (most recent call last):
      File "main.py", line 40, in <module>
        main(config, args)
      File "main.py", line 13, in main
        Mission.demo()
      File "/home/louis/dev/sr/FudanOCR/scene-text-telescope/interfaces/super_resolution.py", line 367, in demo
        images_sr = model(images_lr)
      File "/home/louis/miniconda3/envs/sttsr/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
        return forward_call(*input, **kwargs)
      File "/home/louis/miniconda3/envs/sttsr/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 166, in forward
        return self.module(*inputs[0], **kwargs[0])
      File "/home/louis/miniconda3/envs/sttsr/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
        return forward_call(*input, **kwargs)
      File "/home/louis/dev/sr/FudanOCR/scene-text-telescope/model/tbsrn.py", line 221, in forward
        block[str(i + 2)] = getattr(self, 'block%d' % (i + 2))(block[str(i + 1)])
      File "/home/louis/miniconda3/envs/sttsr/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
        return forward_call(*input, **kwargs)
      File "/home/louis/dev/sr/FudanOCR/scene-text-telescope/model/tbsrn.py", line 255, in forward
        residual = self.feature_enhancer(residual)
      File "/home/louis/miniconda3/envs/sttsr/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
        return forward_call(*input, **kwargs)
      File "/home/louis/dev/sr/FudanOCR/scene-text-telescope/model/tbsrn.py", line 85, in forward
        conv_feature = torch.cat([conv_feature, position2d],1) # batch, 128(64+64), 32, 128
    RuntimeError: Sizes of tensors must match except in dimension 2. Got 1024 and 8192 (The offending index is 0)
    

    This pretrained model is the one provided in Dropbox (have you switched to a different model since uploading perhaps?)

    bug 
    opened by lmmx 1
  • lmdb.Error: ./data/mydata/train_data.mdb: Not a directory

    Hello, my experiment keeps hitting this error: lmdb.Error: ./data/mydata/train_data.mdb: Not a directory. Have you ever come across this situation? How can I solve it? I would appreciate it if you could help me.
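
    This error typically means the path passed to lmdb.open points at the .mdb file itself. An LMDB environment is a directory containing data.mdb and lock.mdb, so either point at the enclosing directory or open the standalone file with subdir=False; a minimal illustration using the path from the error:

    import lmdb

    # Option 1: rename the file to data.mdb and open its parent directory
    env = lmdb.open('./data/mydata', readonly=True, lock=False)

    # Option 2: open a standalone .mdb file directly
    env = lmdb.open('./data/mydata/train_data.mdb', subdir=False,
                    readonly=True, lock=False)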

    opened by cptbtptp125 0
  • How do I convert HWDB datasets to LMDB format?

    Hello, I have tried many methods but failed to convert the HWDB dataset. Could you please explain how to convert HWDB to LMDB?

    I will appreciate it very much.

    opened by cptbtptp125 0
  • How is the .pkl file in print_font_template generated in Stroke-level-decomposition?

    Is there a corresponding support sample for stroke-level recognition in Korean? I would appreciate it if you could help me.

    opened by cptbtptp125 0
  • Why can't I reproduce the results in stroke-level-decomposition?

    Hi, everyone! First of all, I'm very excited that you have shared your work. I ran your code with the default config; only the datasets were generated by myself. Unfortunately, I am getting bad results on train_500 and test_1000, shown below. I don't know what could cause this; the accuracy is only around 0.5% after 50 epochs.

    Note: the epoch number shown is a display error.

    Epoch : 0 | ACC : 0.00011712148844680174
    Epoch : 0 | ACC : 0.003697692706677598
    Epoch : 0 | ACC : 0.003998862248397945
    Epoch : 0 | ACC : 0.0030284270584101593
    Epoch : 0 | ACC : 0.002325698127729349
    Epoch : 0 | ACC : 0.005103150568039219
    Epoch : 0 | ACC : 0.004818712667525558
    Epoch : 0 | ACC : 0.004701591179078756
    Epoch : 0 | ACC : 0.0047685177439055
    Epoch : 0 | ACC : 0.005036224003212475
    Epoch : 0 | ACC : 0.0052202720564860205
    Epoch : 0 | ACC : 0.0052202720564860205
    Epoch : 0 | ACC : 0.005103150568039219
    Epoch : 0 | ACC : 0.005203540415279335
    Epoch : 0 | ACC : 0.0052202720564860205
    Epoch : 0 | ACC : 0.005136613850452591
    Epoch : 0 | ACC : 0.005337393544932822
    Epoch : 0 | ACC : 0.005052955644419161
    Epoch : 0 | ACC : 0.005153345491659277
    Epoch : 0 | ACC : 0.005203540415279335
    Epoch : 0 | ACC : 0.004952565797179045
    Epoch : 0 | ACC : 0.0052202720564860205
    Epoch : 0 | ACC : 0.005186808774072649
    Epoch : 0 | ACC : 0.004952565797179045
    Epoch : 0 | ACC : 0.00538758846855288
    Epoch : 0 | ACC : 0.005504709956999682
    Epoch : 0 | ACC : 0.0051700771328659625
    Epoch : 0 | ACC : 0.005521441598206368
    Epoch : 0 | ACC : 0.005203540415279335
    Epoch : 0 | ACC : 0.005504709956999682
    Epoch : 0 | ACC : 0.005186808774072649
    Epoch : 0 | ACC : 0.005186808774072649
    Epoch : 0 | ACC : 0.005421051750966252
    Epoch : 0 | ACC : 0.005571636521826426
    Epoch : 0 | ACC : 0.005036224003212475
    Epoch : 0 | ACC : 0.00538758846855288
    Epoch : 0 | ACC : 0.005186808774072649
    Epoch : 0 | ACC : 0.005237003697692707
    Epoch : 0 | ACC : 0.004952565797179045
    Epoch : 0 | ACC : 0.005153345491659277
    Epoch : 0 | ACC : 0.0051700771328659625
    Epoch : 0 | ACC : 0.005153345491659277
    Epoch : 0 | ACC : 0.005086418926832533
    Epoch : 0 | ACC : 0.004986029079592417
    
    opened by yeahQing 5
  • How is the pretrained transformer model trained?

    Hello! With my admittedly shallow understanding so far: during training, the transformer directly loads the stroke weights pretrain_transformer_stroke_decomposition.pth, computes results for the SR and HR images, and the L1 loss is backpropagated to the generator; at inference, the LR image passes through the generator to obtain the SR image, and CRNN predicts the result directly. Is that right? Do the transformer's parameters stay fixed throughout, and how was the stroke part trained?

    opened by dzyanshan 0
  • How can I evaluate TBSRN with ASTER and MORAN?

    I prepared pretrained models for ASTER and MORAN, but there are no Aster/Moran files in this repository. I copied the corresponding files from TextZoom to the right paths but ran into runtime errors.

    opened by MinghaoFu 0
Owner
FudanVIC Team
Visual Intelligence & Cognition Team at Fudan University
Motion detector, Full body detection, Upper body detection, Cat face detection, Smile detection, Face detection (haar cascade), Silverware detection, Face detection (lbp), and Sending email notifications

Security camera running OpenCV for object and motion detection. The camera will send email with image of any objects it detects. It also runs a server that provides web interface with live stream video.

Peace 10 Jun 30, 2021
OpenMMLab Text Detection, Recognition and Understanding Toolbox

Introduction English | 简体中文 MMOCR is an open-source toolbox based on PyTorch and mmdetection for text detection, text recognition, and the correspondi

OpenMMLab 3k Jan 7, 2023
A curated list of papers and resources for scene text detection and recognition

Awesome Scene Text A curated list of papers and resources for scene text detection and recognition The year when a paper was first published, includin

Jan Zdenek 43 Mar 15, 2022
End-to-end pipeline for real-time scene text detection and recognition.

Real-time-Scene-Text-Detection-and-Recognition-System End-to-end pipeline for real-time scene text detection and recognition. The detection model use

Fangneng Zhan 89 Aug 4, 2022
Scene text detection and recognition based on Extremal Region(ER)

Scene text recognition A real-time scene text recognition algorithm. Our system is able to recognize text in unconstrain background. This algorithm is

HSIEH, YI CHIA 155 Dec 6, 2022
Tracking the latest progress in Scene Text Detection and Recognition: Must-read papers well organized

SceneTextPapers Tracking the latest progress in Scene Text Detection and Recognition: must-read papers well organized Information about this repositor

Shangbang Long 763 Jan 1, 2023
OCR, Scene-Text-Understanding, Text Recognition

Scene-Text-Understanding Survey [2015-PAMI] Text Detection and Recognition in Imagery: A Survey paper [2014-Front.Comput.Sci] Scene Text Detection and

Alan Tang 354 Dec 12, 2022
This project modify tensorflow object detection api code to predict oriented bounding boxes. It can be used for scene text detection.

This is an oriented object detector based on tensorflow object detection API. Most of the code is not changed except for those related to the need of

Dafang He 30 Oct 22, 2022
A novel region proposal network for more general object detection ( including scene text detection ).

DeRPN: Taking a further step toward more general object detection DeRPN is a novel region proposal network which concentrates on improving the adaptiv

Deep Learning and Vision Computing Lab, SCUT 151 Dec 12, 2022
An Implementation of the algorithm in paper IncepText: A New Inception-Text Module with Deformable PSROI Pooling for Multi-Oriented Scene Text Detection

InceptText-Tensorflow An Implementation of the algorithm in paper IncepText: A New Inception-Text Module with Deformable PSROI Pooling for Multi-Orien

GeorgeJoe 115 Dec 12, 2022
A curated list of resources for text detection/recognition (optical character recognition ) with deep learning methods.

awesome-deep-text-detection-recognition A curated list of awesome deep learning based papers on text detection and recognition. Text Detection Papers

null 2.4k Jan 8, 2023
A curated list of resources dedicated to scene text localization and recognition

Scene Text Localization & Recognition Resources A curated list of resources dedicated to scene text localization and recognition. Any suggestions and

CarlosTao 1.6k Dec 22, 2022
MORAN: A Multi-Object Rectified Attention Network for Scene Text Recognition

MORAN: A Multi-Object Rectified Attention Network for Scene Text Recognition Python 2.7 Python 3.6 MORAN is a network with rectification mechanism for

Canjie Luo 595 Dec 27, 2022
Scene text recognition

AttentionOCR for Arbitrary-Shaped Scene Text Recognition Introduction This is the ranked No.1 tensorflow based scene text spotting algorithm on ICDAR2

null 777 Jan 9, 2023
Code for the AAAI 2018 publication "SEE: Towards Semi-Supervised End-to-End Scene Text Recognition"

SEE: Towards Semi-Supervised End-to-End Scene Text Recognition Code for the AAAI 2018 publication "SEE: Towards Semi-Supervised End-to-End Scene Text

Christian Bartz 572 Jan 5, 2023
Convolutional Recurrent Neural Networks(CRNN) for Scene Text Recognition

CRNN_Tensorflow This is a TensorFlow implementation of a Deep Neural Network for scene text recognition. It is mainly based on the paper "An End-to-En

MaybeShewill-CV 1000 Dec 27, 2022
Code for the paper STN-OCR: A single Neural Network for Text Detection and Text Recognition

STN-OCR: A single Neural Network for Text Detection and Text Recognition This repository contains the code for the paper: STN-OCR: A single Neural Net

Christian Bartz 496 Jan 5, 2023
Multi-Oriented Scene Text Detection via Corner Localization and Region Segmentation

This is the official implementation of "Multi-Oriented Scene Text Detection via Corner Localization and Region Segmentation". For more details, please

Pengyuan Lyu 309 Dec 6, 2022