Project link: https://github.com/facebookresearch/detr
DETR uses a transformer to perform set prediction on images: it directly outputs the box coordinates and class labels for object detection, which are compared against the ground truth to optimize the model. The whole pipeline needs no post-processing such as NMS, making it a truly end-to-end model. This post documents the project's main structure and code.
DETR architecture and forward pass

Project structure
- models
  - position_encoding.py – the sinusoidal position encoding and the learned 2D absolute spatial position encoding
  - transformer.py – differs from the original Transformer mainly in how q, k and v are set up in the encoder and decoder
  - matcher.py – uses the Hungarian algorithm to match each ground truth to the most suitable box among the 100 predicted slots (see the sketch after this list)
  - backbone.py – uses torchvision's built-in ResNet as the backbone to extract image feature maps
  - detr.py – the main DETR model and its set-prediction loss
  - segmentation.py – the segmentation extension (mask head) for panoptic segmentation
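For context on matcher.py, here is a minimal sketch of the Hungarian-matching idea. It is not the repo's actual HungarianMatcher (whose cost combines class probability, L1 box distance, and generalized IoU); this toy version uses a plain L1 cost, but the call to scipy's linear_sum_assignment is the same one the repo relies on.

import torch
from scipy.optimize import linear_sum_assignment

num_queries, num_targets = 100, 3
pred_boxes = torch.rand(num_queries, 4)  # 100 predicted boxes (cx, cy, w, h)
tgt_boxes = torch.rand(num_targets, 4)   # 3 ground-truth boxes

# cost[i, j] = L1 distance between prediction i and ground truth j
cost = torch.cdist(pred_boxes, tgt_boxes, p=1)  # (100, 3)

# Hungarian algorithm: the assignment with minimal total cost;
# each ground truth is matched to exactly one of the 100 slots
row_ind, col_ind = linear_sum_assignment(cost.numpy())
print(list(zip(row_ind, col_ind)))  # e.g. [(17, 0), (42, 1), (88, 2)]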
Position encoding – position_encoding.py
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Various positional encodings for the transformer.
"""
import math

import torch
from torch import nn

from util.misc import NestedTensor


class PositionEmbeddingSine(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.
    Each position of each dimension is mapped to an angle; with scale at its default,
    the angles cover the range 0~2π.
    """
    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi  # angle range 0~2π
        self.scale = scale

    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors  # the image tensor, (batch_size, c, h, w)
        mask = tensor_list.mask  # True where the pixel was produced by padding, (batch_size, h, w)
        assert mask is not None
        not_mask = ~mask  # inverted: True where the pixel belongs to the actual image
        y_embed = not_mask.cumsum(1, dtype=torch.float32)  # cumulative sum along columns (vertical), bool -> float, (batch_size, h, w)
        x_embed = not_mask.cumsum(2, dtype=torch.float32)  # cumulative sum along rows (horizontal), bool -> float, (batch_size, h, w)
        # thanks to the cumulative sums, every row and column index maps to a distinct value
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale  # normalize along the column direction
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale  # normalize along the row direction

        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)  # the denominator inside the sinusoid formula

        pos_x = x_embed[:, :, :, None] / dim_t  # (batch_size, h, w, num_pos_feats)
        pos_y = y_embed[:, :, :, None] / dim_t  # (batch_size, h, w, num_pos_feats)
        # along the last dim: sine on even indices, cosine on odd indices
        # (batch_size,h,w,num_pos_feats//2)+(batch_size,h,w,num_pos_feats//2) -> (batch_size,h,w,num_pos_feats//2,2) -> (batch_size,h,w,2*(num_pos_feats//2))
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        # (batch_size,h,w,num_pos_feats)+(batch_size,h,w,num_pos_feats) -> (batch_size,h,w,2*num_pos_feats) -> (batch_size,2*num_pos_feats,h,w)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos
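A minimal smoke test for the class above, assuming only that NestedTensor exposes .tensors and .mask (which it does in the repo); the stand-in object and all sizes here are hypothetical:

from types import SimpleNamespace
import torch

batch_size, c, h, w = 2, 256, 10, 12
fake = SimpleNamespace(
    tensors=torch.randn(batch_size, c, h, w),              # stand-in for NestedTensor.tensors
    mask=torch.zeros(batch_size, h, w, dtype=torch.bool),  # all False: no padding
)
pe = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
print(pe(fake).shape)  # torch.Size([2, 256, 10, 12]), i.e. (batch_size, 2*num_pos_feats, h, w)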
class PositionEmbeddingLearned(nn.Module):
    """
    Absolute pos embedding, learned. A learnable absolute position embedding.
    """
    def __init__(self, num_pos_feats=256):
        super().__init__()
        # the first dimension of both embeddings is 50, assuming the feature map
        # extracted by the backbone is no larger than 50x50
        self.row_embed = nn.Embedding(50, num_pos_feats)
        self.col_embed = nn.Embedding(50, num_pos_feats)
        self.reset_parameters()

    def reset_parameters(self):  # initialize the embedding weights
        nn.init.uniform_(self.row_embed.weight)
        nn.init.uniform_(self.col_embed.weight)
    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        h, w = x.shape[-2:]
        # every position within a row: encoding a row means encoding all its column indices
        i = torch.arange(w, device=x.device)
        # every position within a column: encoding a column means encoding all its row indices
        j = torch.arange(h, device=x.device)
        x_emb = self.col_embed(i)  # embed each column position within a row, (w, num_pos_feats)
        y_emb = self.row_embed(j)  # embed each row position within a column, (h, num_pos_feats)
        pos = torch.cat([
            # (w, num_pos_feats) -> (1, w, num_pos_feats) -> (h, w, num_pos_feats);
            # x_emb is repeated h times along the row (vertical) direction, so every row shares the same per-column encoding
            x_emb.unsqueeze(0).repeat(h, 1, 1),
            # (h, num_pos_feats) -> (h, 1, num_pos_feats) -> (h, w, num_pos_feats);
            # y_emb is repeated w times along the column (horizontal) direction, so every column shares the same per-row encoding
            y_emb.unsqueeze(1).repeat(1, w, 1),
        ], dim=-1  # concat along the last dim: (h, w, num_pos_feats)+(h, w, num_pos_feats) -> (h, w, 2*num_pos_feats)
        ).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)  # (h, w, 2*num_pos_feats) -> (2*num_pos_feats, h, w) -> (1, 2*num_pos_feats, h, w) -> (batch_size, 2*num_pos_feats, h, w)
        return pos
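To make the repeat/cat trick in the forward above concrete, here is a toy illustration with hypothetical sizes h=2, w=3, num_pos_feats=4:

import torch

x_emb = torch.randn(3, 4)  # one embedding per column position, (w, num_pos_feats)
y_emb = torch.randn(2, 4)  # one embedding per row position, (h, num_pos_feats)
grid = torch.cat([
    x_emb.unsqueeze(0).repeat(2, 1, 1),  # (h, w, 4): every row shares the column encodings
    y_emb.unsqueeze(1).repeat(1, 3, 1),  # (h, w, 4): every column shares the row encodings
], dim=-1)
print(grid.shape)  # torch.Size([2, 3, 8]): each (row, col) cell gets a distinct 2*num_pos_feats vector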
def build_position_encoding(args):
    N_steps = args.hidden_dim // 2  # rows and columns are encoded separately, so each gets half the hidden dimension
    if args.position_embedding in ('v2', 'sine'):  # build the sinusoidal position encoding
        # TODO find a better way of exposing other arguments
        position_embedding = PositionEmbeddingSine(N_steps, normalize=True)
    elif args.position_embedding in ('v3', 'learned'):  # build the learned 2D absolute position encoding
        position_embedding = PositionEmbeddingLearned(N_steps)
    else:
        raise ValueError(f"not supported {args.position_embedding}")

    return position_embedding
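A hypothetical way to call the builder without the full training arg parser; args only needs the two attributes read above:

from types import SimpleNamespace

args = SimpleNamespace(hidden_dim=256, position_embedding='sine')
pe = build_position_encoding(args)  # PositionEmbeddingSine with N_steps=128
args.position_embedding = 'learned'
pe = build_position_encoding(args)  # PositionEmbeddingLearned with N_steps=128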
The sinusoidal position encoding is computed with the formula below.
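This is the standard formulation from Attention Is All You Need, where $pos$ is the (row or column) position and $i$ indexes the embedding dimension; the `dim_t` tensor in the code is exactly the denominator $10000^{2\lfloor i/2\rfloor/d}$:

$$PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d}}\right), \qquad PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d}}\right)$$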
Transformer – the Transformer used in DETR differs from the original version
DETR's encoder and decoder follow the structure shown in the paper's architecture figure: the spatial positional encoding is the 2D encoding built above, and it is added into the encoder's self-attention and the decoder's cross-attention; the object queries are likewise fed into both attention layers of the decoder.
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
DETR Transformer class.

Copy-paste from torch.nn.Transformer with modifications:
    * positional encodings are passed in MHattention
    * extra LN at the end of encoder is removed
    * decoder returns a stack of activations from all decoding layers
"""
import copy
from typing import Optional, List

import torch
import torch.nn.functional as F
from torch import nn, Tensor


class Transformer(nn.Module):

    def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False,
                 return_intermediate_dec=False):
        """
        Initialize the transformer.
        @param d_model: dimension of each token embedding
        @param nhead: number of attention heads
        @param num_encoder_layers: number of encoder layers
        @param num_decoder_layers: number of decoder layers
        @param dim_feedforward: hidden size of the feed-forward network
        @param dropout: dropout probability
        @param activation: activation function
        @param normalize_before: if True, LayerNorm is applied before attention/FFN (pre-norm)
        @param return_intermediate_dec: if True, the decoder returns the outputs of all layers
        """
        super().__init__()
        encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
                                                dropout, activation, normalize_before)  # a single encoder layer
        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None  # the encoder's LayerNorm
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)  # the full encoder

        decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward,
                                                dropout, activation, normalize_before)  # a single decoder layer
        decoder_norm = nn.LayerNorm(d_model)  # the decoder's LayerNorm
        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
                                          return_intermediate=return_intermediate_dec)  # the full decoder

        self._reset_parameters()  # Xavier-initialize all model parameters

        self.d_model = d_model
        self.nhead = nhead

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, src, mask, query_embed, pos_embed):
        # flatten NxCxHxW to HWxNxC
        bs, c, h, w = src.shape  # batch_size, channels, height, width
        # NxCxHxW -> NxCxHW -> HWxNxC: the flattened feature map (H*W) becomes the sequence length,
        # N stays batch_size, and the channel dim C becomes the embedding/hidden size
        src = src.flatten(2).permute(2, 0, 1)
        pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
        query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)
        mask = mask.flatten(1)

        tgt = torch.zeros_like(query_embed)
        memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)  # in the encoder, src serves as q, k and v
        hs = self.decoder(tgt, memory, memory_key_padding_mask=mask,
                          pos=pos_embed, query_pos=query_embed)
        return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w)


class TransformerEncoder(nn.Module):

    def __init__(self, encoder_layer, num_layers, norm=None):
        """
        Initialize the TransformerEncoder.
        @param encoder_layer: nn.Module, a single encoder layer
        @param num_layers: int, number of encoder layers
        @param norm: optional LayerNorm applied to the final output
        """
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)  # num_layers copies of encoder_layer, parameters not shared
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src,
                mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None):
        output = src

        for layer in self.layers:
            output = layer(output, src_mask=mask,
                           src_key_padding_mask=src_key_padding_mask, pos=pos)

        if self.norm is not None:
            output = self.norm(output)

        return output


class TransformerDecoder(nn.Module):

    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)  # num_layers copies of decoder_layer, parameters not shared
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        output = tgt

        intermediate = []  # the output of every decoder layer

        for layer in self.layers:
            output = layer(output, memory, tgt_mask=tgt_mask,
                           memory_mask=memory_mask,
                           tgt_key_padding_mask=tgt_key_padding_mask,
                           memory_key_padding_mask=memory_key_padding_mask,
                           pos=pos, query_pos=query_pos)
            if self.return_intermediate:  # if intermediate outputs are requested, record every layer's output
                intermediate.append(self.norm(output))

        if self.norm is not None:
            output = self.norm(output)  # LayerNorm on the last decoder layer's output
            if self.return_intermediate:
                intermediate.pop()
                intermediate.append(output)

        if self.return_intermediate:
            return torch.stack(intermediate)

        return output.unsqueeze(0)


class TransformerEncoderLayer(nn.Module):

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)  # the encoder's self-attention layer
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):  # add the position encoding onto the tensor
        return tensor if pos is None else tensor + pos

    # post-norm: LayerNorm is applied last
    def forward_post(self, src,
                     src_mask: Optional[Tensor] = None,
                     src_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None):
        q = k = self.with_pos_embed(src, pos)  # q and k are src plus the position encoding
        # v in self-attention is src without any position encoding
        src2 = self.self_attn(q, k, value=src, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]  # output of the self-attention layer
        src = src + self.dropout1(src2)  # residual connection
        src = self.norm1(src)  # layer normalization
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))  # output of the two-linear-layer FFN
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src  # encoder layer output; the shape is unchanged

    # pre-norm: LayerNorm is applied first
    def forward_pre(self, src,
                    src_mask: Optional[Tensor] = None,
                    src_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None):
        src2 = self.norm1(src)  # LayerNorm first
        q = k = self.with_pos_embed(src2, pos)
        src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src2 = self.norm2(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
        src = src + self.dropout2(src2)
        return src

    def forward(self, src,
                src_mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None):
        # dispatch to the pre-norm or post-norm variant
        if self.normalize_before:
            return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
        return self.forward_post(src, src_mask, src_key_padding_mask, pos)
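The listing above is cut off before TransformerDecoderLayer, _get_clones and _get_activation_fn; assuming the rest of the upstream file, a quick shape check of the Transformer could look like this (all sizes hypothetical):

import torch

model = Transformer(d_model=256, nhead=8, num_encoder_layers=2,
                    num_decoder_layers=2, return_intermediate_dec=True)
bs, c, h, w = 2, 256, 10, 12
src = torch.randn(bs, c, h, w)                  # backbone feature map
mask = torch.zeros(bs, h, w, dtype=torch.bool)  # no padded pixels
query_embed = torch.randn(100, c)               # 100 learned object queries
pos_embed = torch.randn(bs, c, h, w)            # spatial position encoding

hs, memory = model(src, mask, query_embed, pos_embed)
print(hs.shape)      # torch.Size([2, 2, 100, 256]): (num_decoder_layers, bs, num_queries, d_model)
print(memory.shape)  # torch.Size([2, 256, 10, 12]): encoder memory reshaped back to the feature map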