import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import torch
import torch.nn as nn
import torch.nn.functional as F
from DocScanner.extractor import BasicEncoder
from DocScanner.update import BasicUpdateBlock

def bilinear_sampler(img, coords, mode="bilinear", mask=False):
    """Wrapper for grid_sample, uses pixel coordinates"""
    H, W = img.shape[-2:]
    xgrid, ygrid = coords.split([1, 1], dim=-1)
    # Normalize pixel coordinates to the [-1, 1] range expected by grid_sample
    xgrid = 2 * xgrid / (W - 1) - 1
    ygrid = 2 * ygrid / (H - 1) - 1

    grid = torch.cat([xgrid, ygrid], dim=-1)
    img = F.grid_sample(img, grid, align_corners=True)

    if mask:
        # Validity mask: True where the sampling location falls inside the image
        mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1)
        return img, mask.float()

    return img

def coords_grid(batch, ht, wd):
    """Build a (batch, 2, ht, wd) grid of (x, y) pixel coordinates."""
    coords = torch.meshgrid(torch.arange(ht), torch.arange(wd))
    coords = torch.stack(coords[::-1], dim=0).float()
    return coords[None].repeat(batch, 1, 1, 1)
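
# For illustration (not in the original source): coords_grid(1, 2, 3) returns a
# tensor of shape (1, 2, 2, 3) whose channel 0 holds the x coordinates
# [[0, 1, 2], [0, 1, 2]] and channel 1 the y coordinates [[0, 0, 0], [1, 1, 1]],
# i.e. the grid is stored in (x, y) order.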

class DocScanner(nn.Module):
    def __init__(self):
        super(DocScanner, self).__init__()
        self.hidden_dim = hdim = 160
        self.context_dim = 160
        # Feature encoder and iterative update block
        self.fnet = BasicEncoder(output_dim=320, norm_fn="instance")
        self.update_block = BasicUpdateBlock(hidden_dim=hdim)

    def freeze_bn(self):
        # Keep BatchNorm layers in eval mode
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()

    def initialize_flow(self, img):
        """Flow is represented as the difference between two coordinate grids: flow = coords1 - coords0"""
        N, C, H, W = img.shape
        coodslar = coords_grid(N, H, W).to(img.device)
        coords0 = coords_grid(N, H // 8, W // 8).to(img.device)
        coords1 = coords_grid(N, H // 8, W // 8).to(img.device)
        return coodslar, coords0, coords1

    def upsample_flow(self, flow, mask):
        """Upsample flow field [H/8, W/8, 2] -> [H, W, 2] using a learned convex combination of 3x3 neighbors"""
        N, _, H, W = flow.shape
        mask = mask.view(N, 1, 9, 8, 8, H, W)
        mask = torch.softmax(mask, dim=2)

        up_flow = F.unfold(8 * flow, [3, 3], padding=1)
        up_flow = up_flow.view(N, 2, 9, 1, 1, H, W)

        up_flow = torch.sum(mask * up_flow, dim=2)
        up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)
        return up_flow.reshape(N, 2, 8 * H, 8 * W)

    def forward(self, image1, iters=12, flow_init=None, test_mode=False):
        image1 = image1.contiguous()
        fmap1 = self.fnet(image1)
        warpfea = fmap1
        # Split the encoder output into hidden state (net) and context features (inp)
        net, inp = torch.split(fmap1, [160, 160], dim=1)
        net = torch.tanh(net)
        inp = torch.relu(inp)

        coodslar, coords0, coords1 = self.initialize_flow(image1)
        if flow_init is not None:
            coords1 = coords1 + flow_init

        flow_predictions = []
        for itr in range(iters):
            coords1 = coords1.detach()
            flow = coords1 - coords0
            net, up_mask, delta_flow = self.update_block(net, inp, warpfea, flow)
            coords1 = coords1 + delta_flow

            # Upsample the 1/8-resolution flow and add it to the full-resolution
            # coordinate grid to obtain the dense backward map
            flow_up = self.upsample_flow(coords1 - coords0, up_mask)
            bm_up = coodslar + flow_up

            # Re-sample the feature map at the updated coordinates for the next iteration
            warpfea = bilinear_sampler(fmap1, coords1.permute(0, 2, 3, 1))
            flow_predictions.append(bm_up)

        if test_mode:
            return bm_up
        return flow_predictions
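
# Minimal usage sketch (assumed, not part of the original file): it presumes a
# 3-channel input whose height and width are divisible by 8, and that
# BasicEncoder / BasicUpdateBlock expose the interfaces used above. The 288x288
# resolution below is only an illustrative choice.
if __name__ == "__main__":
    model = DocScanner().eval()
    dummy = torch.randn(1, 3, 288, 288)  # hypothetical distorted document image
    with torch.no_grad():
        bm = model(dummy, iters=12, test_mode=True)
    # In test mode the model returns the final dense backward map of shape (N, 2, H, W)
    print(bm.shape)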