[Advanced ML & DL Week3] Faster R-CNN PyTorch Implementation

Advanced Study / Advanced ML & DL paper review

by 라라밤쥬 2022. 10. 13. 00:35


Author: Park Jiwoo (15th cohort)

 

 

0. Summary 

 

An input image passes through the convolutional layers to produce a feature map. On this feature map, the RPN (Region Proposal Network) proposes RoIs, i.e., the regions the detector should look at, and the Fast R-CNN head then analyzes the image based on those proposals.

** During training, the conv features are shared between the RPN and the Fast R-CNN head.
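
To make this flow concrete, here is a minimal sketch (my own illustration, not from the paper) showing that torchvision's Faster R-CNN computes the backbone features once and feeds them to both the RPN and the RoI heads internally:

import torch
import torchvision

model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False)   # any FasterRCNN variant works here
model.eval()
images = [torch.rand(3, 600, 800)]    # one dummy RGB image
with torch.no_grad():
    outputs = model(images)           # internally: backbone -> rpn -> roi_heads, features computed once
print(outputs[0].keys())              # dict_keys(['boxes', 'labels', 'scores'])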

 

 

1. Datasets

 

Masks play a crucial role in protecting individuals against respiratory diseases, and they are one of the few precautions available for COVID-19 in the absence of immunization. With this dataset, it is possible to build a model that detects people wearing masks, not wearing them, or wearing masks improperly.
This dataset contains 853 images belonging to 3 classes, along with their bounding boxes in the PASCAL VOC format (a sample annotation is sketched after the list below).
The classes are:

  • With mask;
  • Without mask;
  • Mask worn incorrectly.
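
For reference, here is a minimal VOC-style annotation in the shape the parser in section 2 expects (the values are made up for illustration):

import xml.etree.ElementTree as ET

sample_xml = """
<annotation>
  <filename>example.png</filename>
  <size><width>512</width><height>366</height><depth>3</depth></size>
  <object>
    <name>with_mask</name>
    <bndbox><xmin>79</xmin><ymin>105</ymin><xmax>109</xmax><ymax>142</ymax></bndbox>
  </object>
</annotation>
"""
root = ET.fromstring(sample_xml)
print(root.find('filename').text)                              # example.png
print([o.find('name').text for o in root.findall('object')])   # ['with_mask']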

 

2. PyTorch Implementation

 

 

 

1) Importing libraries

 

import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict                                            
import os

import time

import imgaug as ia
from imgaug import augmenters as iaa                                              # image augmentation 

import cv2
from PIL import Image
from torchvision import transforms

import xml.etree.ElementTree as ET                                                   # library for parsing xml annotations

from xml.etree.ElementTree import Element, ElementTree

 

 

2) Defining the xml parsing function

 

def xml_parser(xml_path):
    xml = open(xml_path, "r")           # open the xml file
    tree = ET.parse(xml)                # parse into a tree
    root = tree.getroot()               # root element
    size = root.find('size')            # find the size element under root
    file_name = root.find('filename').text    # image file name
    object_name = []
    bbox = []
    objects = root.findall('object')
    for _object in objects:
        name = _object.find('name').text
        object_name.append(name)
        bndbox = _object.find('bndbox')            # bounding box element
        one_bbox = []
        xmin = bndbox.find("xmin").text            # left x coordinate
        one_bbox.append(int(float(xmin)))
        ymin = bndbox.find("ymin").text            # top y coordinate
        one_bbox.append(int(float(ymin)))
        xmax = bndbox.find("xmax").text            # right x coordinate
        one_bbox.append(int(float(xmax)))
        ymax = bndbox.find("ymax").text            # bottom y coordinate
        one_bbox.append(int(float(ymax)))
        bbox.append(one_bbox)
    xml.close()                                    # close the file handle
    return file_name, object_name, bbox
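
As a quick check, the parser can be run on one annotation file (the path follows the directory layout used in the next step; the printed values are illustrative):

xml_dir = 'C:/Users/82102/Desktop/archive/annotations/'
sample = sorted(os.listdir(xml_dir))[0]
file_name, object_name, bbox = xml_parser(xml_dir + sample)
print(file_name)      # the image file name stored in the annotation
print(object_name)    # e.g. ['with_mask', 'without_mask']
print(bbox)           # e.g. [[xmin, ymin, xmax, ymax], ...]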

 

 

 

3) Function for drawing boxes on an image

 

def makeBox(voc_im, bbox, objects):     # draws each bounding box and its class name on the image
    image = voc_im.copy()
    for i in range(len(objects)):
        cv2.rectangle(image, (int(bbox[i][0]), int(bbox[i][1])), (int(bbox[i][2]), int(bbox[i][3])), color=(0,255,0), thickness=1)
        cv2.putText(image, objects[i], (int(bbox[i][0]), int(bbox[i][1])-10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0,255,0), 2)
    return image

 

 

4) Collecting labels and objects

xml_list = os.listdir("C:/Users/82102/Desktop/archive/annotations/")        # directory listing of the xml annotations
xml_list.sort()         # sort by file name

label_set = set()

for i in range(len(xml_list)):
    xml_path = 'C:/Users/82102/Desktop/archive/annotations/' + str(xml_list[i])
    file_name, object_name, bbox = xml_parser(xml_path)
    for name in object_name:
        label_set.add(name)           # collect every object name

label_set = sorted(list(label_set))

label_dic = {}
for i, key in enumerate(label_set):
    label_dic[key] = (i+1)            # the three labels map to 1, 2, 3 (torchvision reserves 0 for background)

print(label_dic)

Output:

{'mask_weared_incorrect': 1, 'with_mask': 2, 'without_mask': 3}

 

 

5) Building the dataset

class Pascal_Vo(Dataset):
    def __init__(self, xml_list, len_data):
        
        self.xml_list = xml_list
        self.len_data = len_data                # dataset length
        self.to_tensor = transforms.ToTensor()
        self.flip = iaa.Fliplr(0.5)             # augmentation: horizontal flip with p=0.5
        self.resize = iaa.Resize({'shorter-side': 600, "longer-side": "keep-aspect-ratio"})   # augmentation: resize shorter side to 600
        
    def __len__(self):
        return self.len_data
    
    def __getitem__(self, idx):
        
        xml_path = 'C:/Users/82102/Desktop/archive/annotations/' + str(self.xml_list[idx])   # use the list passed to __init__, not the global
        
        file_name, object_name, bbox = xml_parser(xml_path)
        image_path = 'C:/Users/82102/Desktop/archive/images/' + str(file_name)
        image = Image.open(image_path).convert('RGB')
        image = np.array(image)
        
        image, bbox = self.flip(image=image, bounding_boxes=np.array([bbox]))   # flip image and boxes together
        image, bbox = self.resize(image=image, bounding_boxes=bbox)
        bbox = bbox.squeeze(0).tolist()
        image = self.to_tensor(image)
        
        targets = []
        d = {}
        d['boxes'] = torch.tensor(bbox)
        d['labels'] = torch.tensor([label_dic[x] for x in object_name], dtype=torch.int64)
        targets.append(d)
        
        return image, targets
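
A quick smoke test for the dataset class (my addition; B below is the number of boxes in the sample):

dataset = Pascal_Vo(xml_list[:5], 5)
image, targets = dataset[0]
print(image.shape)                  # torch.Size([3, H, W]), shorter side resized to 600
print(targets[0]['boxes'].shape)    # torch.Size([B, 4])
print(targets[0]['labels'])         # e.g. tensor([2, 2, 3])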

 

 

6) Defining the Faster R-CNN model

 

backbone = torchvision.models.vgg16(pretrained=True).features[:-1]    # pretrained VGG16 conv layers, dropping the last max-pool
backbone_out = 512              # number of output channels
backbone.out_channels = backbone_out

anchor_generator = torchvision.models.detection.rpn.AnchorGenerator(sizes=((128,256,512),), aspect_ratios=((0.5,1.0,2.0),))
# anchor generator: 3 scales x 3 aspect ratios at each feature-map location

resolution = 7
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'], output_size=resolution, sampling_ratio=2)    # RoI pooling layer

box_head = torchvision.models.detection.faster_rcnn.TwoMLPHead(in_channels=backbone_out*(resolution**2), representation_size=4096)
box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(4096, 4)   # number of classes: 3 + 1 for background

# Faster R-CNN head: the two fully connected layers plus the box predictor


model = torchvision.models.detection.FasterRCNN(backbone, num_classes=None,
                        min_size=600, max_size=1000,
                        rpn_anchor_generator=anchor_generator,
                        rpn_pre_nms_top_n_train=6000, rpn_pre_nms_top_n_test=6000,
                        rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=300,
                        rpn_nms_thresh=0.7, rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
                        rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
                        box_roi_pool=roi_pooler, box_head=box_head, box_predictor=box_predictor,
                        box_score_thresh=0.05, box_nms_thresh=0.7, box_detections_per_img=300,
                        box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,
                        box_batch_size_per_image=128, box_positive_fraction=0.25
                    )                      # model hyperparameters: if you change the output size, these must change accordingly
for param in model.rpn.parameters():
    torch.nn.init.normal_(param, mean=0.0, std=0.01)     # initialize RPN parameters from N(0, 0.01)

for name, param in model.roi_heads.named_parameters():
    if "bbox_pred" in name:
        torch.nn.init.normal_(param, mean=0.0, std=0.001)
    elif "weight" in name:
        torch.nn.init.normal_(param, mean=0.0, std=0.01)
    if "bias" in name:
        torch.nn.init.zeros_(param)
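
Before training, a quick sanity check (my addition) that the custom backbone, anchor generator, and heads are wired together correctly:

model.eval()                                  # inference mode so no targets are required
with torch.no_grad():
    out = model([torch.rand(3, 600, 600)])    # FasterRCNN takes a list of image tensors
print(out[0]['boxes'].shape)                  # (num_detections, 4)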

 

 

7) Defining the loss function

 

def Total_loss(loss):
    loss_objectness = loss['loss_objectness']
    loss_rpn_box_reg = loss['loss_rpn_box_reg']
    loss_classifier = loss['loss_classifier']
    loss_box_reg = loss['loss_box_reg']
    
    rpn_total = loss_objectness + 10*loss_rpn_box_reg   # the 10 acts as the lambda balancing weight from the paper
    fast_rcnn_total = loss_classifier + 1*loss_box_reg
    
    total_loss = rpn_total + fast_rcnn_total
    
    return total_loss
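
A quick example with dummy loss values (made up) to make the weighting explicit:

dummy = {'loss_objectness': torch.tensor(0.30), 'loss_rpn_box_reg': torch.tensor(0.05),
         'loss_classifier': torch.tensor(0.80), 'loss_box_reg': torch.tensor(0.20)}
print(Total_loss(dummy))   # 0.30 + 10*0.05 + 0.80 + 1*0.20 = tensor(1.8000)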

 

8) Train 

 

total_epoch = 15

len_data = 25  # number of images per epoch

loss_sum = 0

optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0005)   # SGD optimizer
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, total_epoch, eta_min=0.00001)       # cosine annealing scheduler

start_epoch = 0
start_idx = 0

print("start_epoch = {} , start_idx = {}".format(start_epoch, start_idx))

print("Training Start")
model.train()       # switch to train mode
start = time.time()

for epoch in range(start_epoch, total_epoch):
    
    dataset = Pascal_Vo(xml_list[:len_data], len_data - start_idx)   # build the dataset
    dataloader = DataLoader(dataset, shuffle=True)                   # build the dataloader (batch size 1)
    
    for i, (image, targets) in enumerate(dataloader, start_idx):     # fetch index, image, and targets (labels)
        
        optimizer.zero_grad()
        
        targets[0]['boxes'].squeeze_(0)      # drop the batch dimension added by the dataloader
        targets[0]['labels'].squeeze_(0)
        
        loss = model(image, targets)         # in train mode the model returns a dict of losses
        total_loss = Total_loss(loss)
        loss_sum += total_loss.item()        # .item() so the computation graph is not kept alive across iterations
        
        total_loss.backward()
        optimizer.step()
    
    start_idx = 0
    scheduler.step()
    
    state = {
        'epoch': epoch,
        'iter': i+1,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict()
    }
    print('epoch:' + str(epoch))
    print('loss:' + str(total_loss))
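
The state dict above is built but never written to disk; a sketch of how it could be checkpointed and restored (the file name is illustrative):

torch.save(state, 'faster_rcnn_epoch{}.pth'.format(epoch))          # save after each epoch

checkpoint = torch.load('faster_rcnn_epoch{}.pth'.format(epoch))    # restore later
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
scheduler.load_state_dict(checkpoint['scheduler'])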

 

9)  Evaluation 

 

model.eval()
with torch.no_grad():
    preds = model(image)        # here `image` is the last batch from the training loop

boxes = preds[0]['boxes']       # predicted boxes
labels = preds[0]['labels']     # predicted labels
objects = []
for lb in labels:
    objects.append([k for k, v in label_dic.items() if v == lb][0])   # map label ids back to class names

plot_image = image.squeeze().permute(1,2,0).numpy()
answer = makeBox(plot_image, boxes, objects)   # draw the predicted boxes and class names on the image

plt.imshow(answer)
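
The raw predictions usually contain many overlapping boxes (the problem mentioned in the reflections below); a hedged post-filtering sketch using score thresholding plus torchvision's built-in NMS (both thresholds are guesses to tune):

scores = preds[0]['scores']
keep = scores > 0.5                                                      # drop low-confidence detections
boxes_f, labels_f, scores_f = boxes[keep], labels[keep], scores[keep]
keep_idx = torchvision.ops.nms(boxes_f, scores_f, iou_threshold=0.3)     # suppress overlapping boxes
print(len(boxes), '->', len(keep_idx))                                   # detections before vs. after filtering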

 

 

3. Reflections

 

It is a shame that I could not train the model properly because I had no GPU available. I also ran into a problem during evaluation where many boxes were generated for the same object, and I could not fix it... so this is a model I want to challenge again.

I also learned that torchvision contains more pretrained models than I expected, and that xml files are widely used as datasets for object detection.
