Yolo v3

Loss

  
class Darknet53(nn.Module):
    def __init__(self, cfg, param, training):
        # Excerpt only: the remaining constructor statements are elided here.
        ...
        # Parsed model configuration obtained from `cfg`.
        self.module_cfg = parse_model_config(cfg)
        # Layers built from the parsed configuration.
        self.module_list = self.set_layer(self.module_cfg)
        # Keep only the Yololayer instances found at position 0 of each entry
        # of module_list; the loss code later reads their anchor and stride.
        self.yolo_layers = [layer[0] for layer in self.module_list if isinstance(layer[0], Yololayer)]

진행하기 전에 Darknet53에서 yolo_layers를 추가한다. 이는 darknet53 클래스 내에서 module_list, 즉 우리가 만들어준 layer들에서 Yololayer에 해당하는 것만 저장되어 있는 멤버 변수이다. 추후에 yololayer에서의 anchor과 stride를 사용하기 위해 선언해주었다.

  
class Yololoss(nn.Module):
    """Container for the elementary criteria used by the YOLO loss.

    Args:
        device: device the criteria (and the pos_weight tensor) are placed on.
        n_class: number of object classes; stored as ``n_class``.
    """

    def __init__(self, device, n_class):
        super().__init__()
        self.device = device
        self.n_class = n_class

        # Mean squared error.
        self.mseloss = nn.MSELoss().to(device)

        # Binary cross entropy on probabilities.
        self.bceloss = nn.BCELoss().to(device)

        # Sigmoid followed by binary cross entropy, applied to raw logits.
        # The positive-class weight is a single 1.0 (i.e. no re-weighting).
        unit_pos_weight = torch.tensor([1.0], device=device)
        self.bcelogloss = nn.BCEWithLogitsLoss(pos_weight=unit_pos_weight).to(device)

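추후 loss를 구할 때 사용할 mseloss와 bcelogloss를 선언해놓는다. mseloss의 경우 두 입력값의 차를 제곱한 뒤 평균을 내어 loss를 구하는 것이다. bcelogloss(BCEWithLogitsLoss)의 경우 입력(logit)에 sigmoid를 적용한 뒤 BCE loss를 계산하는 과정을 하나로 합친 방법으로, sigmoid와 bceloss를 따로 적용하는 것보다 수치적으로 안정적이다.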

loss를 구하기 위해 함수를 생성한다.

  
    def compute_loss(self, pred, targets, yololayer):
        """Debug step: allocate the loss accumulators and print every head's output shape.

        `targets` and `yololayer` are not used in this version and nothing is
        returned.
        """
        # One-element accumulators for the class, box and objectness losses.
        lcls = torch.zeros(1, device=self.device)
        lbox = torch.zeros(1, device=self.device)
        lobj = torch.zeros(1, device=self.device)

        # Walk over the outputs of the yolo layers together with their index.
        # Each output is expected as [batch, anchors, grid_h, grid_w, box_attrib].
        for head_idx, head_out in enumerate(pred):
            print("yolo {}, shape {}".format(head_idx, head_out.shape))

학습을 통해 예측된 값이 pred로 들어오고, targets가 GT값이다. 이를 실행해보면 출력은 다음과 같은 형태로 된다. 각 yololayer에서의 bbox의 개수를 구해보면, anchor * grid_h * grid_w 이다. 따라서 0번째 layer는 3 * 19 * 19이고, 3개의 layer에서의 총 box 개수는 22743개가 된다. 이 많은 box들에 대해 loss를 구하게 되면 연산량이 너무 많아진다.

  
yolo 0, shape torch.Size([2, 3, 19, 19, 13])
yolo 1, shape torch.Size([2, 3, 38, 38, 13])
yolo 2, shape torch.Size([2, 3, 76, 76, 13])

box들에는 positive prediction과 negative prediction이 존재한다. positive, negative란 예측한 bbox가 gt bbox와 얼마나 겹치는지를 계산한 후 특정 threshold보다 높으면 positive, 낮으면 negative이다. 우리가 loss를 구하는 종류에는 objectness 에 대한 loss, class score에 대한 loss, bbox에 대한 loss가 있어야 한다. 보통은 bbox loss와 class loss는 positive predict로만 구하고, objectness loss는 negative predict로만 구하는데, 만약 pos : neg = 0.01 : 0.99 의 비율을 가진다면, loss가 노이즈가 너무 많이 생기는 현상이 발생할 수 있다.

따라서 positive에 대한 값들만 추출해서 loss를 계산하고자 했다.

  
    def compute_loss(self, pred, targets, yololayer):
        """Partial version: allocate the loss accumulators and fetch the positive targets."""
        # Class, box and objectness loss accumulators (one zero element each).
        lcls, lbox, lobj = (torch.zeros(1, device=self.device) for _ in range(3))

        # Per-head target classes, boxes, grid indices and anchors.
        tcls, tbox, tindices, tanchors = self.get_targets(pred, targets, yololayer)

먼저 추후에 저장할 각 loss들을 0으로 된 tensor로 선언해둔다. 그리고 positive에 대한 값들만 추출하기 위한 함수, get_targets를 선언한다.

들어가기 전 각 인자들의 shape를 살펴보자.

  
pred : torch.Size([2, 3, 19, 19, 13]), targets : torch.Size([9, 6])

pred : [batch, num of anchor, grid_y, grid_x, box_attrib]
targets : [num of targets in batch, (batch_id, class_id, box[4])]

  
    # for comparing prediction and gt conveniently, we transpose shape
    def get_targets(self, pred, targets, yololayer):
        """Select, for every yolo layer, the (target, anchor) pairs used as positives.

        Args:
            pred: sequence of yolo-layer outputs. Only their shapes are read,
                indexed as [batch, anchors, grid_h, grid_w, box_attrib].
            targets: tensor [num_targets, 6] whose rows are
                (batch_id, class_id, cx, cy, w, h). The box values are
                multiplied by the grid size, so they are expected to be
                normalised to the image size.
            yololayer: sequence of layers exposing `anchor` ([num_anchors, 2])
                and `stride`; one entry per element of `pred`.

        Returns:
            Tuple (tcls, tboxes, tindices, anch) of lists with one entry per
            yolo layer:
              - tcls: class id of every selected pair (long).
              - tboxes: [n, 4] rows (cx - cell_x, cy - cell_y, w, h) in grid units.
              - tindices: tuple (batch_id, anchor_id, cell_y, cell_x) of long tensors.
              - anch: [n, 2] anchor sizes (grid units) of every selected pair.
        """
        # Anchors per layer, read from the prediction layout (3 if pred is empty).
        num_anch = pred[0].shape[1] if len(pred) else 3
        # Everything below is combined with tensors living on self.device.
        targets = targets.to(self.device)
        num_targets = targets.shape[0]
        tcls, tboxes, tindices, anch = [], [], [], []

        # Per-column scale for the 7-dim target rows
        # [batch_id, cls_id, cx, cy, w, h, anchor_id]; only columns 2..5 are rescaled.
        gain = torch.ones(7, device=self.device)

        # Anchor index table of shape [num_anch, num_targets]: row k is filled with k.
        ai = torch.arange(num_anch, device=self.device).float().view(num_anch, 1).repeat(1, num_targets)

        # Replicate every target once per anchor and append the anchor id:
        # [num_targets, 6] -> [num_anch, num_targets, 7].
        targets = torch.cat((targets.repeat(num_anch, 1, 1), ai[:, :, None]), dim=2)

        for yi, yl in enumerate(yololayer):
            # Anchors are given in input-image pixels; express them in grid cells.
            anchors = (yl.anchor / yl.stride).to(self.device)

            grid_h, grid_w = int(pred[yi].shape[2]), int(pred[yi].shape[3])
            gain[2:6] = torch.tensor([grid_w, grid_h, grid_w, grid_h],
                                     dtype=gain.dtype, device=self.device)

            # Undo the normalisation: boxes are now expressed in grid units.
            t = targets * gain

            if num_targets:
                # Box size = anchor size * exp(t_w), so the ratio box / anchor
                # tells how far the anchor is from the target size.
                r = t[:, :, 4:6] / anchors[:, None]

                # Keep a pair only if neither side differs by a factor of 4 or more.
                j = torch.max(r, 1. / r).max(dim=2)[0] < 4
                t = t[j]
            else:
                # No targets: an empty [0, 7] tensor keeps the code below uniform.
                t = targets[0]

            # Integer batch and class ids of the selected pairs.
            batch, cls = t[:, :2].long().T

            gt_xy = t[:, 2:4]
            gt_wh = t[:, 4:6]

            # Grid cell containing the box centre (e.g. 17.2, 17.3 -> 17, 17).
            gt_ij = gt_xy.long()
            gt_i, gt_j = gt_ij.T

            # Anchor index of every selected pair.
            a = t[:, 6].long()

            # Clamp with plain ints so the cell indices stay integer tensors
            # that can be used for indexing, and never leave the grid.
            tindices.append((batch, a, gt_j.clamp(0, grid_h - 1), gt_i.clamp(0, grid_w - 1)))

            # Target box: centre offset inside the cell plus width/height.
            tboxes.append(torch.cat((gt_xy - gt_ij, gt_wh), dim=1))

            # Anchor size matching every selected pair.
            anch.append(anchors[a])

            tcls.append(cls)

        return tcls, tboxes, tindices, anch

필요한 상수들을 지정해주고, 출력값인 tcls, tboxes, indices, anch를 먼저 할당해둔다.

gain : cfg 파일에서의 anchor 박스의 크기를 맞게 설정하고, box 좌표들의 정규화를 풀어주기 위한 변수이다.
ai : anchor index, 우리는 targets에 anchor id도 추가할 것이므로 arange를 통해 0,1,2 index를 만들고, 동일한 dtype을 맞추고, 연산의 정확성을 위해 float로 생성했으며, 생성한 arange는 (1,3)의 shape을 가지므로 이를 (3,1)으로 변환한다. 그리고 타겟의 수만큼 생성한다. num_targets : 9 , ai.shape : torch.Size([3, 9]) tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0.], [1., 1., 1., 1., 1., 1., 1., 1., 1.], [2., 2., 2., 2., 2., 2., 2., 2., 2.]])
targets : 각 target마다 anchor가 3개씩 있으므로 개수를 맞게 만들어주기 위해 repeat를 했다. 즉 각 anchor마다의 GT target 값을 가져야 하기 때문에 이를 맞춰주었다. ai는 2차원이므로 같은 차원으로 만들어줘야 concat이 되기에 마지막 차원을 None값으로 채워 같은 차원으로 맞추었다.
targets[0].shape : [batch_id, class_id, box_cx, box_cy, box_w, box_h, anchor_id]
- ai.shape : torch.Size([3, 12])
- targets before : torch.Size([12, 6])
- targets after : torch.Size([3, 12, 7])
  - 각 targets마다 7가지의 속성을 가지고 있는데, 이것을 anchor 개수만큼 생성했기 때문에 3,12,7이 되었다. 이 때, 12는 객체 개수이다.
yi, yl : yololayers index(0,1,2), yololayers’ layer
anchor : cfg 파일의 anchor들은 608크기에서의 값들이다. 그러므로 이를 feature map에 맞게 재설정해야 한다. 그래서 Yololayer에서 계산한 anchor들을 stride로 나눈다. stride란 feature map의 1grid가 가지는 픽셀의 값이다. 즉 19x19이면 input_size 608 / feature map size 19로 나눈 값이다.
- anchors.shape : torch.Size([3, 2])
t : 앞서 채워준 gain[2:6]에는 grid_w, grid_h, grid_w, grid_h 가 들어있고, 나머지는 1로 되어 있다. box좌표들의 정규화를 풀어주기 위해 targets와 gain을 곱한다.

r : 위의 그림을 보면 $ box_w = anchor_w \cdot \exp(t_w) $ 이다. 따라서 $ r = \exp(t_w) = box_w / anchor_w $ 가 된다. t는 [batch_id, class_id, box_cx, box_cy, box_w, box_h, anchor_id]의 형태를 가지고 있으므로, 4,5번째 index값인 box_w, box_h만을 사용하여 $ \exp(t_w) $, $ \exp(t_h) $ 를 계산한다. 연산을 위해 동일한 차원으로 만들어줘야 해서 anchors[:, None]을 통해 3차원으로 만든다. 이 때, t_w는 predict_w, 즉 예측한 박스의 w를 의미한다.
j : w, h 각각에 대해 r과 1/r 중 큰 값을 구하고, 그렇게 구한 w와 h의 값 중 더 큰 값을 뽑은 뒤, 이 값이 4 미만인지 확인한다. 즉 anchor와 크기 비율이 4배 이상 차이 나는 경우는 제거한다. 그 결과 j는 True, False로 이루어진 bool tensor가 되고, t[j]를 통해 True인 것들만 추출한다.
이렇게 추출된 t는 적절한 anchor 사이즈를 갖는 데이터들로만 필터링하게 된다.

batch, cls : t는 7dim을 가지는 변수이고, 그 중 0 index는 batch, 1 index는 class이므로 [:, :2] 로 작성한다. 그 값들을 int값으로 변환하기 위해 long을 붙이고, 열별로 되어 있던 데이터를 1열로 만들기 위해 transpose했다.

  t[:,:2] : 
   tensor([[0., 0.],
      [0., 0.],
      [1., 0.],
      [1., 0.],
      [1., 0.],
      [0., 0.],
      [0., 0.],
      [1., 0.],
      [1., 0.],
      [0., 0.],
      [0., 0.],
      [1., 0.],
      [1., 0.]])
  batch, class : torch.Size([13]), torch.Size([13]) 

indices : [batch index, anchor index, Cy, Cx], Cy,Cx는 위의 그림에서 Cy,Cx를 의미하는데, 이는 몇번째 grid인지에 대한 상수이다. 만약 gt_xy가 17.2, 17.3를 가진다면 17x17의 index를 가지게 되고, Cx,Cy = 17이 된다. 이 때, clamp 메서드를 사용했는데, 이는 값의 범위를 지정해서 그 이하 또는 이상의 값이 되지 않도록 필터링해준다.
tboxes : 필터링해서 유의미한 타겟에 대한 박스만을 넣어준다. 이 때, gt_xy - gt_ij를 하는 이유는 위의 그림에서 볼 수 있듯이 tx,ty는 feature map의 절대 위치가 아닌 한 그리드 안에서의 위치 좌표를 나타낸다. 따라서 gt_ij가 Cx,Cy를 나타내므로 이를 빼주는 방식으로 0~1 값을 가질 수 있게 된다.
anch : 유의미한 타겟에 대한 박스라고 생각한 값들에 사용한 anchor값들을 anch에 삽입
tcls : 위에서 이미 positive라고 생각한 타겟에 대한 리스트로 t를 필터링했으므로 이에 대한 cls들만 집어넣어주면 된다.

get_targets를 통해 각 yololayer마다, 객체와 크기 비율이 4배 미만으로 차이 나는 anchor만 남겨 (객체, anchor) 쌍을 positive로 선택했다. 따라서 한 객체는 한 yololayer에서 조건을 만족하는 anchor 개수만큼(최대 3개) 선택될 수 있다.

  
def compute_loss(self, pred, targets, yololayer):
        """Compute the weighted sum of the class, box and objectness losses.

        Args:
            pred: sequence of yolo-layer outputs, each indexed as
                [batch, anchor, grid_h, grid_w, box_attrib] where box_attrib
                is (x, y, w, h, objectness, class scores...).
            targets: ground-truth tensor forwarded to get_targets.
            yololayer: the yolo layers forwarded to get_targets.

        Returns:
            (total_loss, loss_list) where total_loss is a one-element tensor
            and loss_list is [total, class, objectness, box] as Python floats.
        """
        # Accumulators for the class, box and objectness losses.
        lcls, lbox, lobj = torch.zeros(1, device = self.device), torch.zeros(1, device = self.device), torch.zeros(1, device = self.device)

        # Positive targets selected for every yolo layer.
        tcls, tbox, tindices, tanchors = self.get_targets(pred, targets, yololayer)

        for pidx, pout in enumerate(pred):
            batch_id, anchor_id, gy, gx = tindices[pidx]

            # Objectness target: one value per predicted box, zero by default.
            tobj = torch.zeros_like(pout[...,0], device=self.device)

            # Number of positive (target, anchor) pairs for this layer.
            num_targets = batch_id.shape[0]

            if num_targets:
                # Attributes of the predicted boxes at the positive positions:
                # shape [num_targets, box_attrib].
                ba = pout[batch_id, anchor_id, gy, gx]
                pred_xy = torch.sigmoid(ba[...,0:2])
                pred_wh = torch.exp(ba[...,2:4]) * tanchors[pidx]
                pred_box = torch.cat((pred_xy, pred_wh),dim=1) # pred_x, pred_y, pred_w, pred_h

                # IoU between every predicted box and its target box.
                iou = bbox_iou(pred_box, tbox[pidx], xyxy=False)

                # Box loss: mean of (1 - IoU), accumulated over the layers.
                # (An MSE loss on xy / wh would be an alternative; IoU is used here.)
                lbox += (1 - iou).mean()

                # Class loss, only when more than one class score is predicted.
                if ba.size(1) - 5 > 1: # xywh, objectness, class scores
                    # One-hot encoding of the target class of every pair.
                    t = torch.zeros_like(ba[...,5:], device = self.device)
                    t[range(num_targets), tcls[pidx]] = 1

                    # BCE between the class logits and the one-hot targets.
                    lcls += self.bcelogloss(ba[:,5:],t)

                # Objectness target at the positive positions: the (detached,
                # non-negative) IoU instead of a hard 1.
                tobj[batch_id, anchor_id, gy, gx] = iou.detach().clamp(0).type(tobj.dtype)

            # The objectness loss is computed even when there is no target.
            lobj += self.bcelogloss(pout[...,4], tobj)

        # Loss weights balancing the three terms.
        lcls *= 0.05
        lobj *= 1.0
        lbox *= 0.5

        total_loss = lcls + lbox + lobj

        # Plain floats for loss-curve visualisation.
        loss_list = [total_loss.item(), lcls.item(), lobj.item(), lbox.item()]

        return total_loss, loss_list

lcls, lbox, lobj : 각각 class, box, objectness loss를 누적하기 위해 크기 1의 0으로 채워진 tensor를 미리 할당
for문 : 예측한 모든 pred, yolo layer에서의 출력값들을 루프를 돌면서 19x19, 38x38, 76x76 feature map에서의 예측값들에 대해 모든 boxes인 22743개를 다 보는 것이 아닌 방금 구했던 positive라고 판단되는 box들만 본다. tcls, tbox 등에 들어있는 것들은 len() = 3을 가진 각각의 feature map에서의 positive box에 대한 정보가 담겨져 있다.
tobj : objectness에 대한 값이므로 마지막 shape을 1로 고정시킨 상태로 생성했다. tobj의 shape을 보면 [2, 3, 19, 19]의 크기를 가진다. 0 대신 :5를 하면 [2, 3, 19, 19, 5]가 될 것이다.
num_targets : batch_id는 num_targets만큼 길이를 가지고 있다. (왜 batch_id가 num_targets 길이를 가지고 있는가)
ba : 해당 grid안에 특정 anchor index, batch index를 가진 box의 속성만을 가져온다. shape는 [num_targets, 13]이다. 즉 각 target에 대한 box_attrib를 가져온다. box attribution[13]에는 bbox[4] + objectness score[1] + classes score[8]로 구성되어 있다.
pred_xy, pred_wh : figure2에서처럼 box_x, box_y는 sigmoid 처리를 하고, box_w, box_h는 exp처리를 하고, anchor box 크기를 곱해준다. (pred_xy에 Cx, Cy는 왜 안곱해주는거지)

  
# box_a, box_b IOU
# xyxy is value for whether or not boxes are [minx,miny,maxx,maxy]
# when we divided by 0 eps use to prevent error
def bbox_iou(box1, box2, xyxy=False, eps = 1e-9):
    """Return the element-wise IoU of two sets of boxes.

    Args:
        box1, box2: tensors of shape [N, 4]; row i of box1 is compared with
            row i of box2.
        xyxy: True if the boxes are (min_x, min_y, max_x, max_y); otherwise
            they are read as (center_x, center_y, width, height).
        eps: small constant added to the heights and to the union so the
            division never hits zero.

    Returns:
        Tensor of shape [N] with the IoU of every pair.
    """
    # Transpose so that each coordinate can be taken as one row.
    box1 = box1.T
    box2 = box2.T

    if xyxy:
        b1_x1, b1_y1, b1_x2, b1_y2 = box1[0],box1[1],box1[2],box1[3]
        b2_x1, b2_y1, b2_x2, b2_y2 = box2[0],box2[1],box2[2],box2[3]
    else:
        # Convert center/size to corner coordinates.
        b1_x1, b1_y1 = box1[0] - box1[2] / 2, box1[1] - box1[3] / 2
        b1_x2, b1_y2 = box1[0] + box1[2] / 2, box1[1] + box1[3] / 2
        b2_x1, b2_y1 = box2[0] - box2[2] / 2, box2[1] - box2[3] / 2
        b2_x2, b2_y2 = box2[0] + box2[2] / 2, box2[1] + box2[3] / 2

    # Intersection area; negative overlaps are clamped to zero.
    inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * \
        (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)).clamp(0)

    # Box sizes; eps is ADDED (not multiplied) to guard against zero height.
    b1_w, b1_h = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
    b2_w, b2_h = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps

    # Union = both areas minus the intersection counted twice, plus eps.
    union = b1_w * b1_h + b2_w * b2_h - inter + eps

    return inter / union

bbox의 iou를 구하는 함수를 선언했다. 여기서 xyxy는 bbox의 포맷이 [minx, miny, maxx, maxy]인지아닌지에 대한 값이다. 그리고 eps, epsilon은 나눗셈을 할 때, 0으로 나누게 되면 에러가 생기게 되므로 이를 방지하기 위해 만든 매우 작은 상수이다.

이 때, 전치를 하지 않을 경우에는 [20, 4]의 shape을 가진다. 이 상태로 계산을 하려면 아래의 식들을 수정해줘야 한다. 전치의 의미는 단지 계산의 과정의 차이이다.

center_x, center_y, w, h 에 대해 box의 min x, min y, max x, max y 를 구하고, 그것들을 통해 교집합을 구한다. 영역의 교집합이므로 특정 상수가 나오는 것이 아닌 공간의 크기가 교집합이 되고, 이를 합집합 영역의 크기와 나누게 되면 iou를 구할 수 있게 된다.

다시 compute_loss로 돌아가서 살펴보도록 하자.

iou : 두 박스를 통해 IOU를 구한다. pred_box는 한 yololayer당 객체 개수만큼의 box를 가지고 있고, tbox[pidx] 또한 동일한 크기의 shape을 가지고 있다.
mseloss : bbox에 대한 loss를 구할 때, MSEloss를 사용하게 되면, 두 박스 좌표의 차를 제곱해서 구하기 때문에 값이 많이 튈 수 있다. 해당 yololayer index에서의 필터링한 tbox에서의 x,y,w,h 각각과 predicted bbox의 x,y,w,h를 비교한다.
lbox : iou는 num_targets와 같은 길이를 가진다. 즉, 각 object 1개마다의 box들의 iou를 구한다. iou의 값들은 항상 0~1사이의 값을 가진다. 그래서 1-iou의 평균으로 loss를 구했다.
class loss : ba에는 bbox[4] + objectness score[1] + classes score[8]가 있으므로 예외 처리를 위해, 즉 classes score가 있을 경우에는 classes score에 대한 loss를 구한다. class에 대한 shape에 대해서만 연산할 것이므로 ba[…, 5:]에 대해서만 메모리 공간을 생성했다.

t : 해당 target class에 맞는 위치의 index에만 1로 바꾸는 one hot encoding 방식이다. 즉, 총 num_targets의 개수가 있고, 1개의 targets 공간마다 target class에 대한 곳에만 1로 만든다.

  tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
          [1., 0., 0., 0., 0., 0., 0., 0.],
          [1., 0., 0., 0., 0., 0., 0., 0.],
          [1., 0., 0., 0., 0., 0., 0., 0.],
          [1., 0., 0., 0., 0., 0., 0., 0.],
          [0., 0., 0., 1., 0., 0., 0., 0.],
          [1., 0., 0., 0., 0., 0., 0., 0.],
          [1., 0., 0., 0., 0., 0., 0., 0.]])

총 8개의 class가 있고, 방금 필터링한 tcls가 있는데, 이 class의 index에만 1을 부여하는 식이다.

lcls : 구한 one-hot 벡터와 예측한 각 class에 대한 확률값을 통해 bcelogitloss를 사용해서 lcls를 구한다.
lobj : 각 object에 대해 0 또는 1의 값으로 loss를 구할 수 있지만, 조금 더 정확한 연산을 위해 iou 값을 집어넣어준다. 그리고 이것과 pred의 4index, objectness score과 BCE loss를 사용하여 loss를 구한다.
loss’s weight : 모든 loss들에 동등한 크기로 계산을 할 수 있지만, 밸런스를 맞게 조정하고, 더 정확한 계산을 위해서는 각 loss에 weight를 부여해야 한다.

이렇게 구해진 최종 loss를 리턴하는데, 추후에 loss graph를 그릴 때 사용하기 위해 loss list에 total_loss, lcls, lobj, lbox를 넣어서 리턴한다.

최종 train

  
# yolov3.py
from tensorboardX import SummaryWriter
...
    torchwriter = SummaryWriter("./output/tensorboard")

시각화하기 위해 사용하는 함수로 tensorboard를 사용할 것이다. 이를 위해 미리 선언해주고, trainer의 인자로 넣어준다.

  
import os, sys
import torch
import torch.optim as optim

from util.tools import *
from loss.loss import *

class Trainer:
    """Training driver: owns the loss, optimizer, LR scheduler and logging writer."""

    def __init__(self, model, train_loader, eval_loader, hyparam, device, torchwriter):
        """Store the collaborators and build the loss, optimizer and scheduler.

        Args:
            model: network to train; must expose `n_classes`, `yolo_layers`
                and `parameters()`.
            train_loader: iterable yielding (images, targets, anno_path) or None.
            eval_loader: stored as-is; not used by the code shown here.
            hyparam: mapping providing 'max_batch', 'lr' and 'momentum'.
            device: device the input batches are moved to.
            torchwriter: object with `add_scalar(tag, value, step)`.
        """
        self.model = model
        self.train_loader = train_loader
        self.eval_loader = eval_loader
        self.max_batch = hyparam['max_batch']
        self.device = device
        self.epoch = 0
        self.iter = 0
        self.yololoss = Yololoss(self.device, self.model.n_classes)
        self.optimizer = optim.SGD(model.parameters(), lr=hyparam['lr'], momentum=hyparam['momentum'])

        # Halve the learning rate at the given milestones so later updates
        # are finer.
        self.scheduler_multistep = optim.lr_scheduler.MultiStepLR(self.optimizer,
                                                             milestones=[20,40,60],
                                                             gamma = 0.5)
        self.torchwriter = torchwriter

    def run_iter(self):
        """Run one pass over `train_loader`.

        Returns:
            The loss tensor of the last processed batch, or None when the
            loader produced no valid batch.
        """
        # Tags matching the order of loss_list returned by compute_loss:
        # [total_loss, lcls, lobj, lbox].
        loss_name = ['total_loss', 'cls_loss', 'obj_loss', 'box_loss']

        # Defined up front so the final return is valid for an empty loader.
        loss = None

        for i, batch in enumerate(self.train_loader):
            # Skip batches that the collate function rejected.
            if batch is None:
                continue

            input_img, targets, anno_path = batch
            # Inputs and targets must live on the same device as the model output.
            input_img = input_img.to(self.device, non_blocking=True)
            targets = targets.to(self.device, non_blocking=True)

            output = self.model(input_img)

            # Loss between the network output and the ground truth.
            loss, loss_list = self.yololoss.compute_loss(output, targets, self.model.yolo_layers)

            loss.backward()
            self.optimizer.step()       # apply the gradients to the weights
            self.optimizer.zero_grad()
            # NOTE(review): passing an explicit step index to scheduler.step()
            # is deprecated in PyTorch; kept to preserve the current schedule.
            self.scheduler_multistep.step(self.iter)
            self.iter += 1

            if i % 10 == 0 :
                lr = get_lr(self.optimizer)
                print("epoch {} / iter {} lr {} loss {}".format(self.epoch, self.iter, lr, loss.item()))
                self.torchwriter.add_scalar('lr', lr, self.iter)
                # loss_list already contains total_loss, so it is logged once here.
                for ln, lv in zip(loss_name, loss_list):
                    self.torchwriter.add_scalar(ln,lv,self.iter)

        return loss

최종적으로 계산한 loss를 역전파를 수행하고, 최적화를 사용하여 weight에 반영하여 업데이트되도록 만든다. 그리고 scheduler를 통해 learning rate를 조정한다. 이를 조정하는 이유는 학습이 진행될수록 learning rate가 작아져야 학습이 더 잘 되기 때문에 이를 조정하기 위해 사용한다.

graph를 그리기 위해 torchwriter.add_scalar를 사용하여 학습된 값들을 저장한다. 이때, learning rate를 저장하기 위해서는 optimizer에서 특정 코드를 통해 빼내야 한다. 이는 아래에 추가해놓았다. 단순하게 optimizer.param_groups에서 [‘lr’]을 추출하는 것이다.

  
# get learning rate in optimizer 
def get_lr(optimizer):
    """Return the 'lr' of the optimizer's first parameter group (None if there is none)."""
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']

  
    def run(self):
        """Train epoch by epoch, writing a checkpoint after every epoch.

        Each epoch puts the model in training mode, runs `run_iter` once and
        saves a dictionary with the epoch, iteration, model state, optimizer
        state and last loss to ./output/model_epoch<N>.pth. The loop stops
        once `self.epoch` reaches `self.max_batch`.
        """
        output_dir = "./output"
        # torch.save does not create missing directories.
        os.makedirs(output_dir, exist_ok=True)

        while True:
            # train
            self.model.train()
            loss = self.run_iter()

            # Saving a dictionary keeps only the selected pieces of state.
            checkpoint_path = os.path.join(output_dir, "model_epoch"+str(self.epoch)+".pth")
            torch.save({'epoch': self.epoch,
                        'iteration': self.iter,
                        'model_state_dict': self.model.state_dict(),
                        'optimizer_state_dict': self.optimizer.state_dict(),
                        'loss':loss},
                        checkpoint_path)

            self.epoch += 1

            # ">=" rather than "==" so a limit below 1 cannot loop forever.
            if self.epoch >= self.max_batch:
                break

run_iter는 1epoch안에서 train_loader 크기만큼 학습하는 과정이고, 1 epoch이 다 돌아가면 checkpoint, model parameter들을 저장한다. torch.save에 model로만 작성하면 model 그대로 저장되지만, 이처럼 딕셔너리로 저장하고 싶은 부분만 저장할 수가 있다.

최종 코드

yolov3.py

  
import torch
import torchvision.transforms as transforms
from torch.utils.data.dataloader import DataLoader

import argparse

import os, sys, time
from dataloader.yolo_data import Yolodata

from util.tools import *
from dataloader.data_transforms import *
from model.yolov3 import * 
from train.train import * 

from tensorboardX import SummaryWriter

def _str2bool(value):
    """Convert a command-line token such as 'True', 'false', '1' or 'no' to bool."""
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    if token in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got {!r}".format(value))


def parse_args():
    """Parse the command-line options of the YOLOv3 entry script.

    Prints the help text and exits when the script is started without any
    argument.

    Returns:
        argparse.Namespace with `gpus` (list of int), `mode`, `cfg`,
        `checkpoint` (str or None) and `download` (bool).
    """
    parser = argparse.ArgumentParser(description="YOLOV3_PYTORCH arguments")
    parser.add_argument("--gpus", type=int, nargs='+',
                        help="List of GPU device id", default=[])
    parser.add_argument("--mode", type=str,
                        help="train / eval / test", default=None)
    parser.add_argument("--cfg", type=str,
                        help="model config path", default=None)
    parser.add_argument("--checkpoint", type=str,
                        help="model checkpoint path", default=None)
    # type=bool would turn every non-empty string (including "False") into
    # True, so the value is parsed explicitly.
    parser.add_argument("--download", type=_str2bool,
                        help="download the dataset", default=False)

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit()
    args = parser.parse_args()
    return args

def collate_fn(batch):
    """Merge per-sample (image, boxes, annotation path) tuples into one batch.

    None samples are dropped; if nothing is left, None is returned. Column 0
    of every boxes tensor is overwritten in place with the sample's position
    in the batch before all boxes are concatenated.

    Returns:
        (imgs, targets, anno_path): stacked images, concatenated boxes and a
        tuple of annotation paths - or None for an empty batch.
    """
    valid_samples = [sample for sample in batch if sample is not None]
    if not valid_samples:
        return

    imgs, targets, anno_path = zip(*valid_samples)

    # Stacking adds the batch dimension in front of each image.
    imgs = torch.stack(list(imgs))

    # Tag every box with the index of the image it belongs to.
    for sample_idx, boxes in enumerate(targets):
        boxes[:, 0] = sample_idx
    targets = torch.cat(targets, 0)

    return imgs, targets, anno_path




''' train '''
def train(cfg_param = None, using_gpus = None):
    """Build the data pipeline, model and trainer, then run training.

    `using_gpus` is accepted but not used: the model is placed on the CPU.
    NOTE: the model config path is read from the module-level `args` that
    the __main__ section creates, not from a parameter.
    """
    print("train")

    transform = get_transformations(cfg_param=cfg_param, is_train=True)

    # Dataset and loader.
    dataset = Yolodata(is_train=True,
                       transform=transform,
                       cfg_param=cfg_param)

    loader_options = dict(
        batch_size=cfg_param['batch'],
        num_workers=0,          # load the data in the main process
        pin_memory=True,        # hand out batches in pinned (page-locked) memory
        drop_last=True,
        shuffle=True,
        collate_fn=collate_fn,  # merges single samples into one batch
    )
    train_loader = DataLoader(dataset, **loader_options)

    model = Darknet53(args.cfg, cfg_param, training=True)

    model.train()
    model.initialize_weights()

    # CPU is used on purpose here; a CUDA device is not selected.
    device = torch.device("cpu")
    model = model.to(device)

    torchwriter = SummaryWriter("./output/tensorboard")

    trainer = Trainer(model=model,
                      train_loader=train_loader,
                      eval_loader=None,
                      hyparam=cfg_param,
                      device=device,
                      torchwriter=torchwriter)
    trainer.run()

    







''' eval '''
def eval(cfg_param = None, using_gpus = None):
    """Placeholder for evaluation mode: only prints "eval"; both arguments are unused."""
    print("eval")

''' test '''
def test(cfg_param = None, using_gpus = None):
    """Placeholder for test mode: only prints "test"; both arguments are unused."""
    print("test")


''' main '''
if __name__ == "__main__":
    # `args` stays a module-level name: train() reads args.cfg from it.
    args = parse_args()

    if args.download:
        os.system("./install_dataset.sh")

    # Hyper-parameters come from the model config file.
    net_data, conv_data = parse_hyperparam_config(args.cfg)

    cfg_param = get_hyperparam(net_data)

    # GPU ids requested on the command line, forwarded to the selected mode.
    using_gpus = [int(g) for g in args.gpus]

    if args.mode == "train":
        train(cfg_param = cfg_param, using_gpus = using_gpus)
    elif args.mode == "eval":
        eval(cfg_param = cfg_param, using_gpus = using_gpus)
    elif args.mode == "test":
        test(cfg_param = cfg_param, using_gpus = using_gpus)
    else:
        print("unknown mode")

    

models/yolov3.py

  
import enum
import os, sys
import numpy as np
import torch
import torch.nn as nn

from util.tools import *

def make_conv_layer(layer_idx : int, modules : nn.Module, layer_info : dict, in_channels : int):
    """Append a convolution (plus optional batch-norm and activation) to `modules`.

    Args:
        layer_idx: index used in the names of the registered sub-modules.
        modules: container receiving the sub-modules via `add_module`.
        layer_info: config entries 'filters', 'size', 'stride',
            'batch_normalize' and 'activation' (string values).
        in_channels: number of input channels of the convolution.
    """
    prefix = 'layer_' + str(layer_idx)

    out_channels = int(layer_info['filters'])
    kernel = int(layer_info['size'])
    step = int(layer_info['stride'])
    # "Same"-style padding for odd kernel sizes.
    padding = (kernel - 1) // 2

    conv = nn.Conv2d(in_channels, out_channels, kernel, step, padding)
    modules.add_module(prefix + '_conv', conv)

    if layer_info['batch_normalize'] == '1':
        modules.add_module(prefix + '_bn', nn.BatchNorm2d(out_channels))

    # Any other activation name adds no activation module.
    activation_types = {'leaky': nn.LeakyReLU, 'relu': nn.ReLU}
    activation_cls = activation_types.get(layer_info['activation'])
    if activation_cls is not None:
        modules.add_module(prefix + '_act', activation_cls())

def make_shortcut_layer(layer_idx : int, modules : nn.Module):
    """Register an nn.Identity placeholder for a shortcut layer.

    The placeholder performs no computation; it keeps every entry of the
    module list the same kind of object.
    """
    placeholder = nn.Identity()
    module_name = 'layer_' + str(layer_idx) + "_shortcut"
    modules.add_module(module_name, placeholder)

def make_route_layer(layer_idx : int, modules : nn.Module):
    """Register an nn.Identity placeholder for a route layer (no computation)."""
    placeholder = nn.Identity()
    module_name = 'layer_' + str(layer_idx) + "_route"
    modules.add_module(module_name, placeholder)

def make_upsample_layer(layer_idx : int, modules : nn.Module, layer_info : dict):
    """Register a nearest-neighbour upsampling module scaled by layer_info['stride']."""
    scale = int(layer_info['stride'])
    upsample = nn.Upsample(scale_factor=scale, mode='nearest')
    modules.add_module('layer_' + str(layer_idx) + '_upsample', upsample)


class Yololayer(nn.Module):
    """Detection head: reshapes a feature map into per-anchor box attributes.

    Args:
        layer_info: config entries 'classes', 'ignore_thresh', 'mask' and
            'anchors' (string values).
        in_width, in_height: size of the network input image.
        is_train: stored as `is_train`.
    """

    def __init__(self, layer_info : dict, in_width : int, in_height : int, is_train : bool):
        super().__init__()
        self.n_classes = int(layer_info['classes'])
        self.ignore_thresh = float(layer_info['ignore_thresh'])
        # Attributes per box: box[4] + objectness[1] + class scores[n_classes].
        self.box_attr = self.n_classes + 5

        # 'mask' selects which of the configured anchors belong to this layer.
        selected = [int(token) for token in layer_info['mask'].split(',')]

        # 'anchors' is a flat list w1,h1,w2,h2,...; regroup it into (w, h) pairs.
        flat_sizes = [int(token) for token in layer_info['anchors'].split(',')]
        anchor_pairs = [(flat_sizes[pos], flat_sizes[pos+1]) for pos in range(0, len(flat_sizes), 2)]
        self.anchor = torch.tensor([anchor_pairs[idx] for idx in selected])

        self.in_width = in_width
        self.in_height = in_height
        # Filled in by forward(): input pixels covered by one grid cell,
        # and the feature map width / height.
        self.stride = None
        self.lw = None
        self.lh = None
        self.is_train = is_train

    def forward(self, x):
        """Reshape x from [N, anchors * box_attr, H, W] to [N, anchors, H, W, box_attr].

        Also records the feature map size, moves the anchors to x's device
        and recomputes `stride` as input size // feature map size.
        """
        self.lw, self.lh = x.shape[3], x.shape[2]
        # Anchors must live on the same device as the input.
        self.anchor = self.anchor.to(x.device)

        stride_w = torch.div(self.in_width, self.lw, rounding_mode = 'floor')
        stride_h = torch.div(self.in_height, self.lh, rounding_mode = 'floor')
        self.stride = torch.tensor([stride_w, stride_h]).to(x.device)

        # Split the channel axis into (anchor, attribute), then move the
        # attributes to the last axis; contiguous() materialises the new layout.
        n_anchor = self.anchor.shape[0]
        per_anchor = x.view(-1, n_anchor, self.box_attr, self.lh, self.lw)
        return per_anchor.permute(0,1,3,4,2).contiguous()


class Darknet53(nn.Module):
    """Network assembled from a parsed model configuration.

    Args:
        cfg: path of the model configuration passed to parse_model_config.
        param: mapping providing 'batch', 'in_channels', 'in_width',
            'in_height' and 'classes'.
        training: training flag; stored as `training` and forwarded to every
            Yololayer.
    """

    def __init__(self, cfg, param, training):
        super().__init__()
        self.batch = int(param['batch'])
        self.in_channels = int(param['in_channels'])
        self.in_width = int(param['in_width'])
        self.in_height = int(param['in_height'])
        self.n_classes = int(param['classes'])
        # Must be set before set_layer(), which hands it to the Yololayers;
        # otherwise they would always see nn.Module's default (True).
        self.training = training
        self.module_cfg = parse_model_config(cfg)
        self.module_list = self.set_layer(self.module_cfg)
        # Yololayer instances only; the loss code reads their anchor and stride.
        self.yolo_layers = [layer[0] for layer in self.module_list
                            if len(layer) > 0 and isinstance(layer[0], Yololayer)]

    @staticmethod
    def _route_channels(in_channels, layer_ref):
        """Return the output channel count of the layer a route entry refers to.

        `in_channels[0]` is the network input and `in_channels[k + 1]` the
        output of layer k, so absolute indices are shifted by one while
        negative (relative) indices already line up with the list end.
        """
        return in_channels[layer_ref] if layer_ref < 0 else in_channels[layer_ref + 1]

    def set_layer(self, layer_info):
        """Build one nn.Sequential per configuration entry and return them as a ModuleList."""
        module_list = nn.ModuleList()
        # Channel bookkeeping: starts with the channels of the input image.
        in_channels = [self.in_channels]
        for layer_idx, info in enumerate(layer_info):
            modules = nn.Sequential()
            if info['type'] == 'convolutional':
                make_conv_layer(layer_idx, modules, info, in_channels[-1])
                in_channels.append(int(info['filters']))
            elif info['type'] == 'shortcut':
                make_shortcut_layer(layer_idx, modules)
                in_channels.append(in_channels[-1])
            elif info['type'] == 'route':
                make_route_layer(layer_idx, modules)
                layers = [int(y) for y in info['layers'].split(',')]
                # A route concatenates its sources along the channel axis,
                # so its channel count is the sum over all of them.
                in_channels.append(sum(self._route_channels(in_channels, l) for l in layers))

            elif info['type'] == 'upsample':
                make_upsample_layer(layer_idx, modules, info)
                # Only width and height grow; the channel count is unchanged.
                in_channels.append(in_channels[-1])

            elif info['type'] == 'yolo':
                yololayer = Yololayer(info, self.in_width, self.in_height, self.training)
                modules.add_module('layer_'+str(layer_idx)+'_yolo', yololayer)
                in_channels.append(in_channels[-1])

            module_list.append(modules)
        return module_list

    def initialize_weights(self):
        """Initialise conv / linear weights with Kaiming-uniform and batch-norm to identity."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_uniform_(m.weight)

                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)  # scale
                nn.init.constant_(m.bias, 0)    # shift
            elif isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Run the network and return the list of yolo-layer outputs."""
        yolo_result = []   # outputs of the yolo layers (the final result)
        layer_result = []  # output of every layer, needed by shortcut and route

        for idx, (name, layer) in enumerate(zip(self.module_cfg, self.module_list)):
            if name['type'] == 'convolutional':
                x = layer(x)
                layer_result.append(x)
            elif name['type'] == 'shortcut':
                # Residual connection with the layer referenced by 'from'.
                x = x + layer_result[int(name['from'])]
                layer_result.append(x)
            elif name['type'] == 'yolo':
                yolo_x = layer(x)
                layer_result.append(yolo_x)
                yolo_result.append(yolo_x)
            elif name['type'] == 'upsample':
                x = layer(x)
                layer_result.append(x)
            elif name['type'] == 'route':
                # Concatenate the referenced layer outputs along the channel axis.
                layers = [int(y) for y in name['layers'].split(',')]
                x = torch.cat([layer_result[l] for l in layers], dim=1)
                layer_result.append(x)
        return yolo_result

train/train.py

  
import os, sys
import torch
import torch.optim as optim

from util.tools import *
from loss.loss import *

class Trainer:
    """Training driver: owns the optimizer, LR schedule, loss and checkpointing.

    Args:
        model: network to train; must expose ``n_classes`` and ``yolo_layers``.
        train_loader: iterable of ``(input_img, targets, anno_path)`` batches;
            a batch may be ``None`` and is then skipped.
        eval_loader: loader kept for evaluation (stored, not used here).
        hyparam: dict providing ``'max_batch'``, ``'lr'`` and ``'momentum'``.
        device: device the images and targets are moved to.
        torchwriter: object with ``add_scalar(tag, value, step)`` used for logging.
    """

    def __init__(self, model, train_loader, eval_loader, hyparam, device, torchwriter):
        self.model = model
        self.train_loader = train_loader
        self.eval_loader = eval_loader
        self.max_batch = hyparam['max_batch']
        self.device = device
        self.epoch = 0
        self.iter = 0
        self.yololoss = Yololoss(self.device, self.model.n_classes)
        self.optimizer = optim.SGD(model.parameters(), lr=hyparam['lr'], momentum=hyparam['momentum'])

        # Lower the learning rate as training progresses so later updates are
        # finer; MultiStepLR halves it at each milestone.
        self.scheduler_multistep = optim.lr_scheduler.MultiStepLR(self.optimizer,
                                                             milestones=[20,40,60],
                                                             gamma = 0.5)
        self.torchwriter = torchwriter

    def run_iter(self):
        """Run one pass over ``train_loader``.

        Returns:
            The total loss of the last processed batch, or ``None`` when the
            loader produced no usable batch.
        """
        # stays None when every batch is skipped (or the loader is empty)
        loss = None
        # same order as the loss_list returned by compute_loss
        loss_name = ['total_loss', 'cls_loss', 'obj_loss', 'box_loss']

        for i, batch in enumerate(self.train_loader):
            # drop the batch when invalid values
            if batch is None:
                continue

            # [batch, C, H, W], [object number, (batchidx, cls_id, box_attrib)]
            input_img, targets, _ = batch
            input_img = input_img.to(self.device, non_blocking=True)
            # the loss builds its helper tensors on self.device, so the targets must live there too
            targets = targets.to(self.device, non_blocking=True)

            output = self.model(input_img)

            # get loss between output and target(gt)
            loss, loss_list = self.yololoss.compute_loss(output, targets, self.model.yolo_layers)

            loss.backward()
            self.optimizer.step()       # apply the gradients to the weights
            self.optimizer.zero_grad()
            # the schedule advances once per iteration; passing an explicit
            # step index to step() is deprecated
            self.scheduler_multistep.step()
            self.iter += 1

            if i % 10 == 0 :
                lr = get_lr(self.optimizer)
                print("epoch {} / iter {} lr {} loss {}".format(self.epoch, self.iter, lr, loss.item()))
                self.torchwriter.add_scalar('lr', lr, self.iter)
                # loss_list already starts with the total loss, so it is logged exactly once
                for ln, lv in zip(loss_name, loss_list):
                    self.torchwriter.add_scalar(ln,lv,self.iter)

        return loss


    def run(self):
        """Train epoch after epoch, saving a checkpoint after each one.

        Stops once ``self.epoch`` reaches ``self.max_batch``.
        """
        # torch.save does not create missing directories
        os.makedirs("./output", exist_ok=True)

        while True:
            # train
            self.model.train()
            loss = self.run_iter()

            # save model
            checkpoint_path = os.path.join("./output","model_epoch"+str(self.epoch)+".pth")
            torch.save({'epoch': self.epoch,
                        'iteration': self.iter,
                        'model_state_dict': self.model.state_dict(),
                        'optimizer_state_dict': self.optimizer.state_dict(),
                        'loss':loss},
                        checkpoint_path)

            # evaluation

            self.epoch += 1

            # >= so the loop still terminates if the counter ever starts past the limit
            if self.epoch >= self.max_batch:
                break

loss/loss.py

  
import torch
import torch.nn as nn
import sys
from util.tools import *


class Yololoss(nn.Module):
    """YOLOv3 training loss made of a box, an objectness and a class term.

    Args:
        device: device on which the loss tensors are created.
        n_class: number of object classes.
    """

    def __init__(self, device, n_class):
        super(Yololoss, self).__init__()
        self.device = device
        self.n_class = n_class
        self.mseloss = nn.MSELoss().to(device) # mean squared error
        self.bceloss = nn.BCELoss().to(device) # binary cross entropy
        # binary cross entropy that takes raw logits
        self.bcelogloss = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.0], device = device)).to(device)

    def compute_loss(self, pred, targets, yololayer):
        """Compute the weighted total loss over all yolo layers.

        Args:
            pred: list with one tensor per yolo layer, each laid out as
                [batch, anchor, grid_h, grid_w, box_attrib].
            targets: ground truth, one row per object:
                [batch_id, class_id, cx, cy, w, h] with the box normalised to 0~1.
            yololayer: the yolo layer objects (their ``anchor`` and ``stride``
                attributes are read).

        Returns:
            (total_loss, loss_list) where loss_list holds the python floats
            [total_loss, class_loss, objectness_loss, box_loss].
        """
        # loss_class, loss_box, loss_objectness
        lcls = torch.zeros(1, device = self.device)
        lbox = torch.zeros(1, device = self.device)
        lobj = torch.zeros(1, device = self.device)

        tcls, tbox, tindices, tanchors = self.get_targets(pred, targets, yololayer)

        for pidx, pout in enumerate(pred):
            batch_id, anchor_id, gy, gx = tindices[pidx]
            # objectness target, zero everywhere except the matched cells
            tobj = torch.zeros_like(pout[...,0], device=self.device)

            num_targets = batch_id.shape[0] # number of matched targets on this layer

            if num_targets:
                # pick the box attributes of the matched (batch, anchor, cell) positions
                ba = pout[batch_id, anchor_id, gy, gx]

                pred_xy = torch.sigmoid(ba[...,0:2])
                pred_wh = torch.exp(ba[...,2:4]) * tanchors[pidx]
                pred_box = torch.cat((pred_xy, pred_wh),dim=1) # pred_x,pred_y,pred_w,pred_h

                # IoU between every matched prediction and its target box
                iou = bbox_iou(pred_box, tbox[pidx], xyxy=False)

                # box loss: mean (1 - IoU), accumulated over the layers
                lbox += (1 - iou).mean()

                # objectness loss
                # instead of a hard 0/1 label, the matched cells get the
                # (detached) IoU as a soft target between 0 and 1
                tobj[batch_id, anchor_id, gy, gx] = iou.detach().clamp(0).type(tobj.dtype)

                # class loss, only when there is more than one class
                if ba.size(1) - 5 > 1: # xywh, obj_info, cls_info
                    # one hot encoding of the target class
                    t = torch.zeros_like(ba[...,5:], device = self.device)
                    t[range(num_targets), tcls[pidx]] = 1

                    lcls += self.bcelogloss(ba[:,5:],t)

            # the objectness loss is computed even when no target matched
            lobj += self.bcelogloss(pout[...,4], tobj)

        # loss weights, to balance the three terms
        lcls *= 0.05
        lobj *= 1.0
        lbox *= 0.5

        total_loss = lcls + lbox + lobj

        # plain floats for logging / visualization
        loss_list = [total_loss.item(), lcls.item(), lobj.item(), lbox.item()]

        return total_loss, loss_list



    def get_targets(self, pred, targets, yololayer):
        """Match ground-truth boxes to anchors and grid cells of every yolo layer.

        Each target is paired with each of the 3 anchors of a layer and kept
        only when the width/height ratio between box and anchor is below 4 in
        both directions.

        Args:
            pred: list of per-layer predictions [batch, anchor, grid_h, grid_w, box_attrib];
                only the shapes are used.
            targets: tensor with rows [batch_id, class_id, cx, cy, w, h] (normalised box).
            yololayer: yolo layer objects providing ``anchor`` and ``stride``.

        Returns:
            (tcls, tboxes, tindices, anch), four lists with one entry per layer:
            target class ids, target boxes [x offset in cell, y offset in cell, w, h]
            in grid units, index tuples (batch_id, anchor_id, cell_y, cell_x) and
            the matched anchors in grid units.
        """
        num_anch = 3
        # gain and the anchor indices are built on self.device; keep the targets there too
        targets = targets.to(self.device)
        num_targets = targets.shape[0] # number of ground-truth objects in the batch
        tcls, tboxes, tindices, anch = [], [], [], [] # target_class, target_box, index, anchor

        gain = torch.ones(7, device=self.device) # targets become 7 dim: [b_id, c_id, cx,cy,w,h,a_id]

        # anchor index per (anchor, target) pair, shape [num_anch, num_targets]
        ai = torch.arange(num_anch, device=targets.device).float().view(num_anch, 1).repeat(1, num_targets)

        # replicate the targets once per anchor and append the anchor index:
        # shape [num_anch, num_targets, 7]
        targets = torch.cat((targets.repeat(num_anch, 1, 1), ai[:,:,None]), dim=2)


        for yi, yl in enumerate(yololayer):
            # the anchors are given in input-image pixels; divide by the stride
            # to express them in grid cells of this layer
            anchors = yl.anchor / yl.stride

            grid_h, grid_w = int(pred[yi].shape[2]), int(pred[yi].shape[3])
            gain[2:6] = torch.tensor(pred[yi].shape)[[3,2,3,2]] # [1,1,grid_w, grid_h, grid_w, grid_h,1]

            # scale the normalised [cx, cy, w, h] up to grid units
            t = targets * gain


            if num_targets:
                # in figure2 of the yolov3 paper, box size = anchor size * exp(prediction)
                # so r = exp(prediction) = box size / anchor size
                r = t[:,:,4:6] / anchors[:, None]

                # keep the pairs whose ratio (in either direction) stays below 4,
                # i.e. drop anchors that fit the box too badly
                j = torch.max(r, 1. / r).max(dim = 2)[0] < 4

                t = t[j] # rows of the kept (anchor, target) pairs
            else: # num_targets == 0
                t = targets[0]

            # batch_id, class_id as integers
            batch, cls = t[:,:2].long().T

            gt_xy = t[:, 2:4]
            gt_wh = t[:, 4:6]

            # Cx, Cy of figure2: the index of the grid cell holding the box centre
            # e.g. on a 19x19 grid a centre at (17.2, 17.3) falls in cell (17, 17)
            gt_ij = gt_xy.long()
            gt_i, gt_j = gt_ij.T

            # anchor index
            a = t[:, 6].long()

            # add indices
            # clamp with integer bounds so the cell indices stay inside the grid
            # and remain integer tensors usable for indexing
            tindices.append((batch, a, gt_j.clamp(0, grid_h - 1), gt_i.clamp(0, grid_w - 1))) # [batch id, anchor id, Cy, Cx]

            # add target box
            # the x/y target is the offset inside the cell (box centre - cell index),
            # which is what the sigmoid of the prediction represents in figure2
            tboxes.append(torch.cat((gt_xy-gt_ij, gt_wh), dim=1))

            # add the anchor matched to every kept target
            anch.append(anchors[a])

            # add class
            tcls.append(cls)

        return tcls, tboxes, tindices, anch

dataloader/data_transforms.py

  
import numpy as np
import cv2
import torch
import torchvision.transforms as transforms

import imgaug as ia
from imgaug import augmenters as iaa
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage

from util.tools import xywh2xyxy_np


def get_transformations(cfg_param = None, is_train = None):
    """Build the preprocessing pipeline applied to every (image, label) sample.

    The steps are: labels to absolute pixels, default augmentation, labels
    back to relative coordinates, resize to the network input size, and
    conversion to tensors.

    Args:
        cfg_param: dict providing ``'in_width'`` and ``'in_height'``.
        is_train: training / evaluation switch.

    Returns:
        A ``transforms.Compose`` of the steps above.
    """
    # NOTE(review): training and evaluation currently share one and the same
    # pipeline (including the random DefaultAug), so is_train has no effect.
    target_size = (cfg_param['in_width'], cfg_param['in_height'])
    steps = [
        AbsoluteLabels(),
        DefaultAug(),
        RelativeLabels(),
        ResizeImage(new_size = target_size),
        ToTensor(),
    ]
    return transforms.Compose(steps)


# Absolute bbox: the labels arrive as 0~1 values relative to the image size;
# they are turned into absolute pixel values so that later transforms do not
# lose the box information.
class AbsoluteLabels(object):
    """Scale normalised box labels to pixel units, in place."""

    def __call__(self, data):
        """Take ``(img, label)`` and return it with the label columns scaled.

        ``img`` is an H x W x C array; ``label`` rows are
        [cls, cx, cy, w, h]. The label array is modified in place.
        """
        img, label = data
        img_h, img_w, _ = img.shape
        label[:, [1, 3]] *= img_w  # cx and box width  -> pixels
        label[:, [2, 4]] *= img_h  # cy and box height -> pixels
        return img, label

# Relative bbox: the inverse of AbsoluteLabels.
class RelativeLabels(object):
    """Normalise pixel-unit box labels by the image size, in place."""

    def __call__(self, data):
        """Take ``(img, label)`` and return it with the label columns divided.

        ``img`` is an H x W x C array; ``label`` rows are
        [cls, cx, cy, w, h]. The label array is modified in place.
        """
        img, label = data
        img_h, img_w, _ = img.shape
        label[:, [1, 3]] /= img_w  # cx and box width  -> fraction of image width
        label[:, [2, 4]] /= img_h  # cy and box height -> fraction of image height
        return img, label

class ResizeImage(object):
    """Resize the image of an ``(img, label)`` sample with ``cv2.resize``.

    Args:
        new_size: target size handed to ``cv2.resize`` (stored as a tuple).
        interpolation: OpenCV interpolation flag forwarded to ``cv2.resize``.
    """

    def __init__(self, new_size, interpolation = cv2.INTER_LINEAR):
        self.interpolation = interpolation
        self.new_size = tuple(new_size)

    def __call__(self, data):
        image, label = data
        resized = cv2.resize(image, self.new_size, interpolation=self.interpolation)
        # The label is passed through untouched: it is normalised, so multiplying
        # it by the new width / height later yields the resized box.
        return resized, label

class ToTensor(object):
    """Convert an ``(img, label)`` sample from numpy arrays to torch tensors."""

    def __call__(self, data):
        """Return ``(img, label)`` as float32 tensors.

        The image is divided by 255 and rearranged from HWC to CHW.
        """
        image, label = data

        scaled = np.array(image, dtype=float) / 255  # normalise to 0~1
        chw = np.transpose(scaled, (2, 0, 1))        # HWC -> CHW
        image_tensor = torch.tensor(chw, dtype=torch.float32)

        label_tensor = torch.FloatTensor(np.array(label))

        return image_tensor, label_tensor



# Augmentation template: concrete augmentations inherit from this class. The
# shared part is that whenever the image is augmented, the bounding boxes have
# to be transformed in the same way.
class ImgAug(object):
    """Apply an imgaug augmenter to an image together with its boxes.

    Args:
        augmentations: callable accepting ``image=`` and ``bounding_boxes=``
            keyword arguments and returning the augmented pair. Defaults to a
            fresh empty list (a placeholder; subclasses set a real augmenter).
    """

    def __init__(self, augmentations=None):
        # a fresh list per instance instead of one mutable default shared by all
        self.augmentations = [] if augmentations is None else augmentations

    def __call__(self, data):
        """Augment ``(img, labels)``; label rows are [cls, cx, cy, w, h].

        Returns:
            (img, boxes) where boxes is a float array of shape [n, 5] in the
            same [cls, cx, cy, w, h] layout; n can shrink because boxes that
            leave the image are removed.
        """
        # unpack data
        img, labels = data
        # convert xywh to minx,miny,maxx,maxy, the format imgaug works with;
        # use floats so the corner arithmetic is not truncated for integer labels
        boxes = np.array(labels, dtype=float)
        boxes[:,1:] = xywh2xyxy_np(boxes[:,1:]) # column 0 is the class id

        # convert bbox to imgaug format
        bounding_boxes = BoundingBoxesOnImage(
                                [BoundingBox(*box[1:], label=box[0]) for box in boxes],
                                shape=img.shape)

        # apply augmentation
        img, bounding_boxes = self.augmentations(image=img,
                                                 bounding_boxes=bounding_boxes)

        # handle boxes that ended up outside the image
        bounding_boxes = bounding_boxes.clip_out_of_image()

        # convert bounding boxes back to np.array()
        boxes = np.zeros((len(bounding_boxes), 5))
        for box_idx, box in enumerate(bounding_boxes):
            x1, y1, x2, y2 = box.x1, box.y1, box.x2, box.y2

            # back to the original [cls, cx, cy, w, h] format
            boxes[box_idx, 0] = box.label
            boxes[box_idx, 1] = (x1 + x2) / 2
            boxes[box_idx, 2] = (y1 + y2) / 2
            boxes[box_idx, 3] = x2 - x1
            boxes[box_idx, 4] = y2 - y1

        return img, boxes


class DefaultAug(ImgAug):
    """Default augmentation: light sharpening followed by a random affine transform."""

    def __init__(self,):
        self.augmentations = iaa.Sequential([
                                    # alpha is the blend factor of the sharpened image; it is
                                    # given as a range so that some sharpening is actually
                                    # applied (a fixed alpha of 0.0 leaves the image unchanged)
                                    iaa.Sharpen((0.0, 0.1)),
                                    # no rotation, up to 10% translation, scale between 0.8 and 1.5
                                    iaa.Affine(rotate=(-0,0), translate_percent=(-0.1, 0.1), scale=(0.8, 1.5))
                                    ])

dataloader/yolo_data.py

  
import torch
from torch.utils.data import Dataset
import torchvision

from PIL import Image
import numpy as np

import os, sys

class Yolodata(Dataset): # inherits from torch.utils.data.Dataset
    """Dataset reading images and YOLO-format annotation text files.

    Args:
        is_train: pick the train split when True, the valid split otherwise.
        transform: optional callable applied to ``(img, bbox)``.
        cfg_param: dict providing ``'classes'``.
    """

    # format path
    file_dir = ''
    anno_dir = ''
    file_txt = ''

    base_dir = 'C:\\Users\\dkssu\\dev\\datasets\\KITTI\\'
    # train dataset path
    train_img = base_dir + 'train\\JPEGimages\\'
    train_txt = base_dir + 'train\\Annotations\\'
    # valid dataset path
    valid_img = base_dir + 'valid\\JPEGimages\\'
    valid_txt = base_dir + 'valid\\Annotations\\'

    class_names = ['Car', 'Van', 'Truck', 'Pedestrian', 'Persion_sitting', 'Cyclist', 'Tram', 'Misc'] # DontCare is excluded
    num_classes = None
    img_data = []

    def __init__(self, is_train=True, transform=None, cfg_param=None):
        super(Yolodata, self).__init__()
        self.is_train = is_train
        self.transform = transform
        self.num_class = cfg_param['classes']

        if self.is_train:
            self.file_dir = self.train_img
            self.anno_dir = self.train_txt
            self.file_txt = self.base_dir + 'train\\ImageSets\\train.txt'
        else:
            self.file_dir = self.valid_img
            self.anno_dir = self.valid_txt
            self.file_txt = self.base_dir + 'valid\\ImageSets\\valid.txt'

        with open(self.file_txt, 'r', encoding='UTF-8', errors='ignore') as f:
            img_names = [i.replace("\n", "") for i in f.readlines()]

        # keep every listed name for which an image file exists, trying the
        # extensions in this order and stopping at the first hit
        img_data = []
        for name in img_names:
            for ext in (".jpg", ".JPG", ".png", ".PNG"):
                if os.path.exists(self.file_dir + name + ext):
                    img_data.append(name + ext)
                    break

        self.img_data = img_data
        print("data length : {}".format(len(self.img_data)))

    def __getitem__(self, index):
        """Load one sample.

        Returns:
            ``(img, target_data, anno_path)`` when the annotation directory
            exists and the image has at least one box; target_data rows are
            [batch_idx (0), cls, cx, cy, w, h].
            ``None`` when the annotation file is missing or holds no box.
            ``(img, None, None)`` when there is no annotation directory (test mode).
        """
        img_path = self.file_dir + self.img_data[index]

        # Image.open as a context manager closes the file once the pixels are copied
        with Image.open(img_path) as pil_img:
            img = np.array(pil_img.convert('RGB'), dtype=np.uint8) # img shape : [H,W,C]

        if not os.path.isdir(self.anno_dir):
            # test mode: no labels, a placeholder box keeps the transform interface happy
            bbox = np.array([[0,0,0,0,0]])
            if self.transform is not None:
                img, _ = self.transform((img, bbox))
            return img, None, None

        # annotation file: same base name as the image, with a .txt extension
        txt_name = os.path.splitext(self.img_data[index])[0] + '.txt'
        anno_path = self.anno_dir + txt_name

        if not os.path.exists(anno_path):
            return None

        bbox = []
        with open(anno_path, 'r') as f: # annotation about each image
            for line in f:
                gt_data = line.split() # [class, center_x, center_y, width, height]

                # skip when abnormal data
                if len(gt_data) < 5:
                    continue

                cls, cx, cy, w, h = (float(v) for v in gt_data[:5])
                bbox.append([cls, cx, cy, w, h])

        # a sample without any box is dropped; nothing needs to be transformed for it
        if not bbox:
            return None

        bbox = np.array(bbox)

        # data augmentation
        if self.transform is not None:
            img, bbox = self.transform((img, bbox))

        # without a tensor-producing transform bbox is still a numpy array
        bbox = torch.as_tensor(bbox, dtype=torch.float32)

        # leading column for the index of the sample inside its batch (zero here)
        batch_idx = torch.zeros(bbox.shape[0]) # one entry per object
        target_data = torch.cat((batch_idx.view(-1,1), bbox), dim=1)
        return img, target_data, anno_path

    def __len__(self):
        return len(self.img_data)

tools.py

  
from PIL import Image, ImageDraw
import numpy as np
import matplotlib.pyplot as plt

# parse model layer configuration
def parse_model_config(path):
    """Parse a darknet-style cfg file into a list of layer definitions.

    Every ``[section]`` except ``[net]`` becomes one dict with a ``'type'``
    key plus the section's ``key=value`` pairs (values kept as strings).
    ``convolutional`` sections get ``'batch_normalize': 0`` as default.

    Args:
        path: path of the cfg file.

    Returns:
        list of dicts, one per layer, in file order.
    """
    with open(path, 'r') as file:
        raw_lines = file.read().split('\n')

    # strip first, so indented comments and whitespace-only lines are dropped too
    lines = [x.strip() for x in raw_lines]
    lines = [x for x in lines if x and not x.startswith('#')]

    module_defs = []
    type_name = None
    for line in lines:
        if line.startswith("["):
            type_name = line[1:-1].rstrip()
            # [net] holds hyperparameters, not a layer
            if type_name == 'net':
                continue
            module_defs.append({'type': type_name})
            if type_name == 'convolutional':
                module_defs[-1]['batch_normalize'] = 0

        else:
            if type_name == "net":
                continue

            # split on the first '=' only, so values may contain '='
            key, value = line.split('=', 1)
            module_defs[-1][key.rstrip()] = value.strip()

    return module_defs


# parse the network hyperparameters of the yolov3 configuration
def parse_hyperparam_config(path):
    """Collect the ``[net]`` and ``[convolutional]`` sections of a cfg file.

    Args:
        path: path of the cfg file.

    Returns:
        (module_defs, conv_defs): one dict per ``[net]`` section and one dict
        per ``[convolutional]`` section, each holding ``'type'`` plus the
        section's ``key=value`` pairs as strings. Other sections are ignored.
    """
    with open(path, 'r') as file:
        raw_lines = file.read().split('\n')

    # strip first, so indented comments and whitespace-only lines are dropped too
    lines = [x.strip() for x in raw_lines]
    lines = [x for x in lines if x and not x.startswith('#')] # keep lines that do not start with '#'

    # definitions of the network hyperparameters
    module_defs = []

    # parameters of the convolution layers
    conv_defs = []

    # dict receiving the key=value lines of the current section,
    # None while inside a section that is not collected
    current = None

    for line in lines:
        # layer division
        if line.startswith("["):
            type_name = line[1:-1].rstrip()
            if type_name == "net": # net holds the hyperparameters of the network
                current = {'type': type_name}
                module_defs.append(current)
            elif type_name == 'convolutional':
                current = {'type': type_name}
                conv_defs.append(current)
            else:
                current = None

        elif current is not None:
            # split on the first '=' only, so values may contain '='
            key, value = line.split("=", 1)
            current[key.rstrip()] = value.strip()

    return module_defs, conv_defs

# pick out the hyperparameters we need from the parsed config
def get_hyperparam(data):
    """Return the typed hyperparameters of the first ``'net'`` entry in *data*.

    Args:
        data: iterable of dicts, each with a ``'type'`` key.

    Returns:
        dict of converted hyperparameters, or ``None`` when *data* holds no
        ``'net'`` entry.
    """
    def keep(value):
        return value

    # (output key, config key, converter), in the order the values are read
    fields = (
        ('batch', 'batch', int),
        ('subdivision', 'subdivisions', int),
        ('momentum', 'momentum', float),
        ('decay', 'decay', float),
        ('saturation', 'saturation', float),
        ('lr', 'learning_rate', float),
        ('burn_in', 'burn_in', int),
        ('max_batch', 'max_batches', int),
        ('lr_policy', 'policy', keep),
        ('in_width', 'width', int),
        ('in_height', 'height', int),
        ('in_channels', 'channels', int),
        ('classes', 'class', int),
        ('ignore_class', 'ignore_cls', int),
    )

    net = next((d for d in data if d['type'] == 'net'), None)
    if net is None:
        return None

    return {out_key: convert(net[cfg_key]) for out_key, cfg_key, convert in fields}


def xywh2xyxy_np(x: np.ndarray):
    """Convert boxes from [cx, cy, w, h] to [minx, miny, maxx, maxy].

    The last axis of *x* holds the box; the result has the same shape and
    dtype as *x*.
    """
    corners = np.zeros_like(x)
    centre = x[..., 0:2]
    half_size = x[..., 2:4] / 2
    corners[..., 0:2] = centre - half_size  # minx, miny
    corners[..., 2:4] = centre + half_size  # maxx, maxy
    return corners


# IoU of box_a and box_b
# xyxy tells whether the boxes are [minx,miny,maxx,maxy]; eps keeps the division away from zero
def bbox_iou(box1, box2, xyxy=False, eps = 1e-9):
    """Return the element-wise IoU of two sets of boxes.

    Args:
        box1, box2: tensors of shape [n, 4], paired row by row.
        xyxy: True when the boxes are [minx, miny, maxx, maxy]; otherwise
            they are read as [cx, cy, w, h].
        eps: small constant added to the heights and to the union so that
            degenerate boxes do not cause a division by zero.

    Returns:
        tensor of shape [n] with the IoU of every pair.
    """
    # imported here because the function needs torch.min / torch.max
    import torch

    # put the coordinate axis first so each coordinate is one row
    box1 = box1.T
    box2 = box2.T

    if xyxy:
        b1_x1, b1_y1, b1_x2, b1_y2 = box1[0],box1[1],box1[2],box1[3]
        b2_x1, b2_y1, b2_x2, b2_y2 = box2[0],box2[1],box2[2],box2[3]
    else:
        # centre / size -> corners
        b1_x1, b1_y1 = box1[0] - box1[2] / 2, box1[1] - box1[3] / 2
        b1_x2, b1_y2 = box1[0] + box1[2] / 2, box1[1] + box1[3] / 2
        b2_x1, b2_y1 = box2[0] - box2[2] / 2, box2[1] - box2[3] / 2
        b2_x2, b2_y2 = box2[0] + box2[2] / 2, box2[1] + box2[3] / 2

    # intersection: overlap along x times overlap along y, zero when disjoint
    inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * \
        (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)).clamp(0)

    # box sizes; eps is ADDED so the areas cannot collapse to exactly zero
    b1_w, b1_h = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
    b2_w, b2_h = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps

    # union: both areas minus the intersection counted twice
    union = b1_w * b1_h + b2_w * b2_h - inter + eps

    # IOU
    iou = inter / union

    return iou

# get learning rate in optimizer
def get_lr(optimizer):
    """Return the ``'lr'`` of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter group.
    """
    return next((group['lr'] for group in optimizer.param_groups), None)

[데브코스] 10주차 - DeepLearning Yolo v3 coding (2)