I tried following the usage instructions you posted, using a sample .jpg image of a receipt as input.
Every time I run it, I get an error saying, "RuntimeError: Expected 4-dimensional input for 4-dimensional weight [64, 3, 7, 7], but got 3-dimensional input of size [3, 384, 500] instead".
How do I fix that?
Full code:
import pytesseract
import sys
import modeling, dataset
from transformers import BertTokenizerFast
# Hyperparameters handed to both ExtractFeatures and DocFormerEncoder below.
config = {
    "coordinate_size": 96,
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "image_feature_pool_shape": [7, 7, 256],
    "intermediate_ff_size_factor": 4,
    "max_2d_position_embeddings": 1000,
    "max_position_embeddings": 512,
    "max_relative_positions": 8,
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "pad_token_id": 0,
    "shape_size": 96,
    "vocab_size": 30522,
    "layer_norm_eps": 1e-12,
}  # the closing brace was missing in the original paste (SyntaxError)

# Configure the Tesseract binary *before* building features, so the path is
# already in effect if create_features runs OCR (presumably it does -- confirm).
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

fp = "images/data_sample.jpg"
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# The visual branch starts with a Conv2d whose weight is [64, 3, 7, 7]; it needs
# a batched 4-D (N, C, H, W) input. Without a batch axis the image arrives as
# [3, 384, 500] and conv2d raises the reported RuntimeError.
# NOTE(review): assumes the installed dataset.create_features accepts
# add_batch_dim -- confirm; if it does not, call .unsqueeze(0) on every tensor
# in `encoding` before passing it to the feature extractor.
encoding = dataset.create_features(fp, tokenizer, add_batch_dim=True)

feature_extractor = modeling.ExtractFeatures(config)
docformer = modeling.DocFormerEncoder(config)

v_bar, t_bar, v_bar_s, t_bar_s = feature_extractor(encoding)
output = docformer(v_bar, t_bar, v_bar_s, t_bar_s)  # shape (1, 512, 768)
Full error:
RuntimeError Traceback (most recent call last)
Input In [3], in <module>
31 feature_extractor = modeling.ExtractFeatures(config)
32 docformer = modeling.DocFormerEncoder(config)
---> 34 v_bar, t_bar, v_bar_s, t_bar_s = feature_extractor(encoding)
35 output = docformer(v_bar, t_bar, v_bar_s, t_bar_s)
File ~\anaconda3\envs\docformer_env\lib\site-packages\torch\nn\modules\module.py:1102, in Module._call_impl(self, *input, **kwargs)
1098 # If we don't have any hooks, we want to skip the rest of the logic in
1099 # this function, and just call forward.
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
File ~\Documents\Projects\docformer_implementation\docformer/src/docformer\modeling.py:512, in ExtractFeatures.forward(self, encoding)
509 x_feature = encoding['x_features']
510 y_feature = encoding['y_features']
--> 512 v_bar = self.visual_feature(image)
513 t_bar = self.language_feature(language)
515 v_bar_s, t_bar_s = self.spatial_feature(x_feature, y_feature)
File ~\anaconda3\envs\docformer_env\lib\site-packages\torch\nn\modules\module.py:1102, in Module._call_impl(self, *input, **kwargs)
1098 # If we don't have any hooks, we want to skip the rest of the logic in
1099 # this function, and just call forward.
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
File ~\Documents\Projects\docformer_implementation\docformer/src/docformer\modeling.py:48, in ResNetFeatureExtractor.forward(self, x)
47 def forward(self, x):
---> 48 x = self.resnet50(x)
49 x = self.conv1(x)
50 x = self.relu1(x)
File ~\anaconda3\envs\docformer_env\lib\site-packages\torch\nn\modules\module.py:1102, in Module._call_impl(self, *input, **kwargs)
1098 # If we don't have any hooks, we want to skip the rest of the logic in
1099 # this function, and just call forward.
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
File ~\anaconda3\envs\docformer_env\lib\site-packages\torch\nn\modules\container.py:141, in Sequential.forward(self, input)
139 def forward(self, input):
140 for module in self:
--> 141 input = module(input)
142 return input
File ~\anaconda3\envs\docformer_env\lib\site-packages\torch\nn\modules\module.py:1102, in Module._call_impl(self, *input, **kwargs)
1098 # If we don't have any hooks, we want to skip the rest of the logic in
1099 # this function, and just call forward.
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
File ~\anaconda3\envs\docformer_env\lib\site-packages\torch\nn\modules\conv.py:446, in Conv2d.forward(self, input)
445 def forward(self, input: Tensor) -> Tensor:
--> 446 return self._conv_forward(input, self.weight, self.bias)
File ~\anaconda3\envs\docformer_env\lib\site-packages\torch\nn\modules\conv.py:442, in Conv2d._conv_forward(self, input, weight, bias)
438 if self.padding_mode != 'zeros':
439 return F.conv2d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
440 weight, bias, self.stride,
441 _pair(0), self.dilation, self.groups)
--> 442 return F.conv2d(input, weight, bias, self.stride,
443 self.padding, self.dilation, self.groups)
RuntimeError: Expected 4-dimensional input for 4-dimensional weight [64, 3, 7, 7], but got 3-dimensional input of size [3, 384, 500] instead