Environment requirements

I am using YOLOv5 7.0.

Let's look at the results first:


The results are for reference only.

Step 1:

First, set up the YOLOv5 environment.
Installing the dependencies with pip install -r requirements.txt is enough.
My other blog posts describe the environment setup in detail.
Set up the GPU environment (a CUDA-enabled PyTorch build) yourself; a quick check is shown below.
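
If you want to confirm that the GPU build of PyTorch is usable before moving on, a minimal check of my own (not part of the original scripts) is enough:

import torch

# main_gradcam.py defaults to --device cuda, so CUDA should be available.
print(torch.__version__)
print(torch.cuda.is_available())  # should print True for GPU inference
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))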

Step 2:

Run YOLOv5 once to confirm the installation works (for example, python detect.py --weights yolov5s.pt --source data/images/bus.jpg) and check the output:

Step 3

Add a file named main_gradcam.py in the project root directory.

main_gradcam.py

import argparse
import os
import random
import time

import cv2
import numpy as np

from models.gradcam import YOLOV5GradCAM, YOLOV5GradCAMPP
from models.yolov5_object_detector import YOLOV5TorchObjectDetector

# class names of the dataset (COCO)
names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
         'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
         'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
         'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
         'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
         'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
         'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
         'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
         'hair drier', 'toothbrush']  # class names

# the three output layers feeding the Detect head of yolov5s
target_layers = ['model_17_cv3_act', 'model_20_cv3_act', 'model_23_cv3_act']

# Arguments
parser = argparse.ArgumentParser()
parser.add_argument('--model-path', type=str, default="yolov5s.pt", help='Path to the model')
parser.add_argument('--img-path', type=str, default='data/images/bus.jpg', help='input image path')
parser.add_argument('--output-dir', type=str, default='runs/result17', help='output dir')
parser.add_argument('--img-size', type=int, default=640, help="input image size")
parser.add_argument('--target-layer', type=str, default='model_17_cv3_act',
                    help='The layer hierarchical address to which Grad-CAM will be applied;'
                         ' the names are separated by underscores')
parser.add_argument('--method', type=str, default='gradcam', help='gradcam or gradcampp')
parser.add_argument('--device', type=str, default='cuda', help='cuda or cpu')
parser.add_argument('--no_text_box', action='store_true',
                    help='do not show label and box on the heatmap')
args = parser.parse_args()


def get_res_img(bbox, mask, res_img):
    mask = mask.squeeze(0).mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).detach().cpu().numpy().astype(np.uint8)
    heatmap = cv2.applyColorMap(mask, cv2.COLORMAP_JET)
    # n_heatmat = (Box.fill_outer_box(heatmap, bbox) / 255).astype(np.float32)
    n_heatmat = (heatmap / 255).astype(np.float32)
    res_img = res_img / 255
    res_img = cv2.add(res_img, n_heatmat)
    res_img = (res_img / res_img.max())
    return res_img, n_heatmat


def plot_one_box(x, img, color=None, label=None, line_thickness=3):
    # this is a bug in cv2: it does not draw the box on an image converted from torch
    # unless the image is buffered and read again
    cv2.imwrite('temp.jpg', (img * 255).astype(np.uint8))
    img = cv2.imread('temp.jpg')

    # Plots one bounding box on image img
    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1  # line/font thickness
    color = color or [random.randint(0, 255) for _ in range(3)]
    c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
    cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
    if label:
        tf = max(tl - 1, 1)  # font thickness
        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
        outside = c1[1] - t_size[1] - 3 >= 0  # label fits outside box, above
        c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 if outside else c1[1] + t_size[1] + 3
        outsize_right = c2[0] - img.shape[:2][1] > 0  # label sticks out on the right
        c1 = c1[0] - (c2[0] - img.shape[:2][1]) if outsize_right else c1[0], c1[1]
        c2 = c2[0] - (c2[0] - img.shape[:2][1]) if outsize_right else c2[0], c2[1]
        cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)  # filled
        cv2.putText(img, label, (c1[0], c1[1] - 2 if outside else c2[1] - 2), 0, tl / 3, [225, 255, 255],
                    thickness=tf, lineType=cv2.LINE_AA)
    return img


# run detection and Grad-CAM on a single image
def main(img_path):
    colors = [[random.randint(0, 255) for _ in range(3)] for _ in names]
    device = args.device
    input_size = (args.img_size, args.img_size)
    # load the image (OpenCV reads it as BGR)
    img = cv2.imread(img_path)
    print('[INFO] Loading the model')
    # instantiate the YOLOv5 wrapper that also returns class logits
    model = YOLOV5TorchObjectDetector(args.model_path, device, img_size=input_size, names=names)
    # img[..., ::-1]: BGR --> RGB
    # (480, 640, 3) --> (1, 3, 480, 640)
    torch_img = model.preprocessing(img[..., ::-1])
    tic = time.time()
    # iterate over the three detection layers
    for target_layer in target_layers:
        # select the Grad-CAM variant
        if args.method == 'gradcam':
            saliency_method = YOLOV5GradCAM(model=model, layer_name=target_layer, img_size=input_size)
        elif args.method == 'gradcampp':
            saliency_method = YOLOV5GradCAMPP(model=model, layer_name=target_layer, img_size=input_size)
        masks, logits, [boxes, _, class_names, conf] = saliency_method(torch_img)
        # recover the (letterboxed) network input as an image
        result = torch_img.squeeze(0).mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).detach().cpu().numpy()
        result = result[..., ::-1]  # convert to BGR
        # output settings
        image_name = os.path.basename(img_path)  # image file name
        save_path = f'{args.output_dir}/{image_name[:-4]}/{args.method}'
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        print(f'[INFO] Saving the final image at {save_path}')
        # iterate over every detected object in the image
        for i, mask in enumerate(masks):
            res_img = result.copy()
            # bounding box and class of the current object
            bbox, cls_name = boxes[0][i], class_names[0][i]
            label = f'{cls_name} {conf[0][i]}'  # class + confidence score
            # overlay the heatmap of the current object
            res_img, heat_map = get_res_img(bbox, mask, res_img)
            res_img = plot_one_box(bbox, res_img, label=label, color=colors[int(names.index(cls_name))],
                                   line_thickness=3)
            # resize back to the original image size
            res_img = cv2.resize(res_img, dsize=img.shape[:-1][::-1])
            output_path = f'{save_path}/{target_layer[6:8]}_{i}.jpg'
            cv2.imwrite(output_path, res_img)
            print(f'{target_layer[6:8]}_{i}.jpg done!!')
    print(f'Total time : {round(time.time() - tic, 4)} s')


if __name__ == '__main__':
    # if the image path is a folder, process every image inside it
    if os.path.isdir(args.img_path):
        img_list = os.listdir(args.img_path)
        print(img_list)
        for item in img_list:
            main(os.path.join(args.img_path, item))
    # a single image
    else:
        main(args.img_path)
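
The names in target_layers follow the module hierarchy of the loaded model with dots replaced by underscores; in yolov5s, modules 17, 20 and 23 are the last C3 blocks before the three Detect branches, and cv3.act is their final activation. If you use a different model and are unsure which names are valid, a small helper such as the following (my own sketch, not part of the original script) lists the candidates:

from models.yolov5_object_detector import YOLOV5TorchObjectDetector

# List activation layers in the underscore format expected by --target-layer / find_yolo_layer().
# Assumes yolov5s.pt is available; change the weight path for your own model.
detector = YOLOV5TorchObjectDetector('yolov5s.pt', 'cpu', img_size=(640, 640))
for name, module in detector.model.named_modules():
    if name.endswith('.act'):             # activation layers only
        print(name.replace('.', '_'))     # e.g. model_17_cv3_act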

Step 4

Add the following two .py files under the models folder: gradcam.py and yolov5_object_detector.py.

The code of gradcam.py is as follows:

import time
import torch
import torch.nn.functional as F


def find_yolo_layer(model, layer_name):
    """Find yolov5 layer to calculate GradCAM and GradCAM++

    Args:
        model: yolov5 model.
        layer_name (str): the name of layer with its hierarchical information.

    Return:
        target_layer: found layer
    """
    hierarchy = layer_name.split('_')
    target_layer = model.model._modules[hierarchy[0]]
    for h in hierarchy[1:]:
        target_layer = target_layer._modules[h]
    return target_layer


class YOLOV5GradCAM:
    # locate the target layer and register the hooks
    def __init__(self, model, layer_name, img_size=(640, 640)):
        self.model = model
        self.gradients = dict()
        self.activations = dict()

        def backward_hook(module, grad_input, grad_output):
            self.gradients['value'] = grad_output[0]
            return None

        def forward_hook(module, input, output):
            self.activations['value'] = output
            return None

        target_layer = find_yolo_layer(self.model, layer_name)
        # record the activations in the forward pass and the gradients in the backward pass
        target_layer.register_forward_hook(forward_hook)
        target_layer.register_full_backward_hook(backward_hook)
        device = 'cuda' if next(self.model.model.parameters()).is_cuda else 'cpu'
        self.model(torch.zeros(1, 3, *img_size, device=device))

    def forward(self, input_img, class_idx=True):
        """
        Args:
            input_img: input image with shape of (1, 3, H, W)
        Return:
            mask: saliency map of the same spatial dimension with input
            logit: model output
            preds: The object predictions
        """
        saliency_maps = []
        b, c, h, w = input_img.size()
        preds, logits = self.model(input_img)
        for logit, cls, cls_name in zip(logits[0], preds[1][0], preds[2][0]):
            if class_idx:
                score = logit[cls]
            else:
                score = logit.max()
            self.model.zero_grad()
            tic = time.time()
            # backpropagate the class score to get the gradients
            score.backward(retain_graph=True)
            print(f"[INFO] {cls_name}, model-backward took: ", round(time.time() - tic, 4), 'seconds')
            gradients = self.gradients['value']
            activations = self.activations['value']
            b, k, u, v = gradients.size()
            alpha = gradients.view(b, k, -1).mean(2)
            weights = alpha.view(b, k, 1, 1)
            saliency_map = (weights * activations).sum(1, keepdim=True)
            saliency_map = F.relu(saliency_map)
            saliency_map = F.interpolate(saliency_map, size=(h, w), mode='bilinear', align_corners=False)
            saliency_map_min, saliency_map_max = saliency_map.min(), saliency_map.max()
            saliency_map = (saliency_map - saliency_map_min).div(saliency_map_max - saliency_map_min).data
            saliency_maps.append(saliency_map)
        return saliency_maps, logits, preds

    def __call__(self, input_img):
        return self.forward(input_img)


class YOLOV5GradCAMPP(YOLOV5GradCAM):
    def __init__(self, model, layer_name, img_size=(640, 640)):
        super(YOLOV5GradCAMPP, self).__init__(model, layer_name, img_size)

    def forward(self, input_img, class_idx=True):
        saliency_maps = []
        b, c, h, w = input_img.size()
        tic = time.time()
        preds, logits = self.model(input_img)
        print("[INFO] model-forward took: ", round(time.time() - tic, 4), 'seconds')
        for logit, cls, cls_name in zip(logits[0], preds[1][0], preds[2][0]):
            if class_idx:
                score = logit[cls]
            else:
                score = logit.max()
            self.model.zero_grad()
            tic = time.time()
            # backpropagate the class score to get the gradients
            score.backward(retain_graph=True)
            print(f"[INFO] {cls_name}, model-backward took: ", round(time.time() - tic, 4), 'seconds')
            gradients = self.gradients['value']  # dS/dA
            activations = self.activations['value']  # A
            b, k, u, v = gradients.size()
            alpha_num = gradients.pow(2)
            alpha_denom = gradients.pow(2).mul(2) + \
                activations.mul(gradients.pow(3)).view(b, k, u * v).sum(-1, keepdim=True).view(b, k, 1, 1)
            # torch.where(condition, x, y): keep the denominator where it is non-zero, otherwise use 1
            alpha_denom = torch.where(alpha_denom != 0.0, alpha_denom, torch.ones_like(alpha_denom))
            alpha = alpha_num.div(alpha_denom + 1e-7)
            positive_gradients = F.relu(score.exp() * gradients)  # ReLU(dY/dA) == ReLU(exp(S)*dS/dA)
            weights = (alpha * positive_gradients).view(b, k, u * v).sum(-1).view(b, k, 1, 1)
            saliency_map = (weights * activations).sum(1, keepdim=True)
            saliency_map = F.relu(saliency_map)
            saliency_map = F.interpolate(saliency_map, size=(h, w), mode='bilinear', align_corners=False)
            saliency_map_min, saliency_map_max = saliency_map.min(), saliency_map.max()
            saliency_map = (saliency_map - saliency_map_min).div(saliency_map_max - saliency_map_min).data
            saliency_maps.append(saliency_map)
        return saliency_maps, logits, preds
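
For reference, the saliency computation in YOLOV5GradCAM.forward is the standard Grad-CAM formulation: the gradients of the class score with respect to the hooked activation map are global-average-pooled into per-channel weights, and the weighted sum of activations is passed through a ReLU:

$$
\alpha_k^{c} = \frac{1}{uv}\sum_{i=1}^{u}\sum_{j=1}^{v}\frac{\partial y^{c}}{\partial A_{ij}^{k}},
\qquad
L_{\text{Grad-CAM}}^{c} = \operatorname{ReLU}\Big(\sum_{k}\alpha_k^{c}\,A^{k}\Big)
$$

Here y^c is the class score (score = logit[cls]) and A^k is channel k of the hooked layer's activation; this corresponds to alpha = gradients.view(b, k, -1).mean(2) and (weights * activations).sum(1) in the code, followed by bilinear upsampling and min-max normalization. YOLOV5GradCAMPP only changes how the channel weights are computed.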

The code of yolov5_object_detector.py is as follows:

import time

import cv2
import numpy as np
import torch
import torch.nn as nn
import torchvision

from models.experimental import attempt_load
from utils.general import xywh2xyxy
from utils.dataloaders import letterbox
from utils.metrics import box_iou


class YOLOV5TorchObjectDetector(nn.Module):
    def __init__(self, model_weight, device, img_size, names=None, mode='eval', confidence=0.45, iou_thresh=0.45,
                 agnostic_nms=False):
        super(YOLOV5TorchObjectDetector, self).__init__()
        self.device = device
        self.model = None
        self.img_size = img_size
        self.mode = mode
        self.confidence = confidence
        self.iou_thresh = iou_thresh
        self.agnostic = agnostic_nms
        self.model = attempt_load(model_weight, inplace=False, fuse=False)
        self.model.requires_grad_(True)
        self.model.to(device)
        if self.mode == 'train':
            self.model.train()
        else:
            self.model.eval()
        # fetch the names
        if names is None:
            self.names = ['your dataset classname']
        else:
            self.names = names
        # preventing cold start
        img = torch.zeros((1, 3, *self.img_size), device=device)
        self.model(img)

    @staticmethod
    def non_max_suppression(prediction, logits, conf_thres=0.3, iou_thres=0.45, classes=None, agnostic=False,
                            multi_label=False, labels=(), max_det=300):
        """Runs Non-Maximum Suppression (NMS) on inference and logits results

        Returns:
            list of detections, one (n, 6) tensor per image [xyxy, conf, cls], and the pruned input logits (n, number-classes)
        """
        nc = prediction.shape[2] - 5  # number of classes
        xc = prediction[..., 4] > conf_thres  # candidates

        # Checks
        assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
        assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'

        # Settings
        min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height
        max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
        time_limit = 10.0  # seconds to quit after
        redundant = True  # require redundant detections
        multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
        merge = False  # use merge-NMS

        t = time.time()
        output = [torch.zeros((0, 6), device=prediction.device)] * prediction.shape[0]
        logits_output = [torch.zeros((0, nc), device=logits.device)] * logits.shape[0]
        # logits_output = [torch.zeros((0, 80), device=logits.device)] * logits.shape[0]
        for xi, (x, log_) in enumerate(zip(prediction, logits)):  # image index, image inference
            # Apply constraints
            # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
            x = x[xc[xi]]  # confidence
            log_ = log_[xc[xi]]

            # Cat apriori labels if autolabelling
            if labels and len(labels[xi]):
                l = labels[xi]
                v = torch.zeros((len(l), nc + 5), device=x.device)
                v[:, :4] = l[:, 1:5]  # box
                v[:, 4] = 1.0  # conf
                v[range(len(l)), l[:, 0].long() + 5] = 1.0  # cls
                x = torch.cat((x, v), 0)

            # If none remain process next image
            if not x.shape[0]:
                continue

            # Compute conf
            x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

            # Box (center x, center y, width, height) to (x1, y1, x2, y2)
            box = xywh2xyxy(x[:, :4])

            # Detections matrix nx6 (xyxy, conf, cls)
            if multi_label:
                i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
                x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
            else:  # best class only
                conf, j = x[:, 5:].max(1, keepdim=True)
                x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]
                log_ = log_[conf.view(-1) > conf_thres]

            # Filter by class
            if classes is not None:
                x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

            # Check shape
            n = x.shape[0]  # number of boxes
            if not n:  # no boxes
                continue
            elif n > max_nms:  # excess boxes
                x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence

            # Batched NMS
            c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
            boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
            i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
            if i.shape[0] > max_det:  # limit detections
                i = i[:max_det]
            if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
                # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
                iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
                weights = iou * scores[None]  # box weights
                x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
                if redundant:
                    i = i[iou.sum(1) > 1]  # require redundancy

            output[xi] = x[i]
            logits_output[xi] = log_[i]
            assert log_[i].shape[0] == x[i].shape[0]
            if (time.time() - t) > time_limit:
                print(f'WARNING: NMS time limit {time_limit}s exceeded')
                break  # time limit exceeded
        return output, logits_output

    @staticmethod
    def yolo_resize(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True):
        return letterbox(img, new_shape=new_shape, color=color, auto=auto, scaleFill=scaleFill, scaleup=scaleup)

    def forward(self, img):
        prediction, logits, _ = self.model(img, augment=False)
        prediction, logits = self.non_max_suppression(prediction, logits, self.confidence, self.iou_thresh,
                                                      classes=None, agnostic=self.agnostic)
        self.boxes, self.class_names, self.classes, self.confidences = [[[] for _ in range(img.shape[0])] for _ in
                                                                        range(4)]
        for i, det in enumerate(prediction):  # detections per image
            if len(det):
                for *xyxy, conf, cls in det:
                    # box coordinates as integers
                    bbox = [int(b) for b in xyxy]
                    self.boxes[i].append(bbox)
                    self.confidences[i].append(round(conf.item(), 2))
                    cls = int(cls.item())
                    self.classes[i].append(cls)
                    if self.names is not None:
                        self.class_names[i].append(self.names[cls])
                    else:
                        self.class_names[i].append(cls)
        return [self.boxes, self.classes, self.class_names, self.confidences], logits

    def preprocessing(self, img):
        if len(img.shape) != 4:
            img = np.expand_dims(img, axis=0)
        im0 = img.astype(np.uint8)
        img = np.array([self.yolo_resize(im, new_shape=self.img_size)[0] for im in im0])
        img = img.transpose((0, 3, 1, 2))
        img = np.ascontiguousarray(img)
        img = torch.from_numpy(img).to(self.device)
        img = img / 255.0
        return img
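
The wrapper can also be exercised on its own as a quick sanity check. The sketch below is my own example rather than part of the original code; it assumes yolov5s.pt in the project root, the bus.jpg sample image, and the yolo.py modification from step 5 below (without it, forward cannot unpack the extra logits output). The class names are read back from the checkpoint instead of being passed in:

import cv2
from models.yolov5_object_detector import YOLOV5TorchObjectDetector

detector = YOLOV5TorchObjectDetector('yolov5s.pt', 'cuda', img_size=(640, 640))
ckpt_names = detector.model.names                    # dict {index: name} in YOLOv5 7.0 checkpoints
detector.names = list(ckpt_names.values()) if isinstance(ckpt_names, dict) else list(ckpt_names)

img = cv2.imread('data/images/bus.jpg')              # BGR
torch_img = detector.preprocessing(img[..., ::-1])   # BGR -> RGB, letterbox, NCHW, scaled to [0, 1]
[boxes, classes, class_names, confidences], logits = detector(torch_img)
print(boxes[0], class_names[0], confidences[0])      # detections of the first (and only) image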

Step 5

Modify models/yolo.py.

Specifically, modify the forward function of the Detect class:

def forward(self, x):
    z = []  # inference output
    logits_ = []  # modification ---1
    for i in range(self.nl):
        x[i] = self.m[i](x[i])  # conv
        bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
        x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

        if not self.training:  # inference
            if self.dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:
                self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)

            logits = x[i][..., 5:]  # modification ---2

            if isinstance(self, Segment):  # (boxes + masks)
                xy, wh, conf, mask = x[i].split((2, 2, self.nc + 1, self.no - self.nc - 5), 4)
                xy = (xy.sigmoid() * 2 + self.grid[i]) * self.stride[i]  # xy
                wh = (wh.sigmoid() * 2) ** 2 * self.anchor_grid[i]  # wh
                y = torch.cat((xy, wh, conf.sigmoid(), mask), 4)
            else:  # Detect (boxes only)
                xy, wh, conf = x[i].sigmoid().split((2, 2, self.nc + 1), 4)
                xy = (xy * 2 + self.grid[i]) * self.stride[i]  # xy
                wh = (wh * 2) ** 2 * self.anchor_grid[i]  # wh
                y = torch.cat((xy, wh, conf), 4)
            z.append(y.view(bs, self.na * nx * ny, self.no))
            logits_.append(logits.view(bs, -1, self.no - 5))  # modification ---3

    # return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x)
    return x if self.training else (torch.cat(z, 1), torch.cat(logits_, 1), x)  # modification ---4
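
After this change, the model returns three values at inference time (the decoded detections, the raw per-anchor class logits, and the per-layer feature maps), which is exactly what yolov5_object_detector.py unpacks with prediction, logits, _ = self.model(img, augment=False). A quick shape check, my own sketch assuming the stock COCO yolov5s weights and a 640x640 input:

import torch
from models.experimental import attempt_load

model = attempt_load('yolov5s.pt', fuse=False)
model.eval()
pred, logits, feats = model(torch.zeros(1, 3, 640, 640))
print(pred.shape)    # expected: torch.Size([1, 25200, 85])  (xywh + obj + 80 class scores per anchor)
print(logits.shape)  # expected: torch.Size([1, 25200, 80])  (raw class logits used by Grad-CAM)
print(len(feats))    # 3 feature maps, one per detection layer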

In case you are not sure how to modify yolo.py, the complete modified file is given below.
yolo.py

# YOLOv5 by Ultralytics, GPL-3.0 license
"""
YOLO-specific modules

Usage:
    $ python models/yolo.py --cfg yolov5s.yaml
"""

import argparse
import contextlib
import os
import platform
import sys
from copy import deepcopy
from pathlib import Path

FILE = Path(__file__).resolve()
ROOT = FILE.parents[1]  # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH
if platform.system() != 'Windows':
    ROOT = Path(os.path.relpath(ROOT, Path.cwd()))  # relative

from models.common import *
from models.experimental import *
from utils.autoanchor import check_anchor_order
from utils.general import LOGGER, check_version, check_yaml, make_divisible, print_args
from utils.plots import feature_visualization
from utils.torch_utils import (fuse_conv_and_bn, initialize_weights, model_info, profile, scale_img, select_device,
                               time_sync)

try:
    import thop  # for FLOPs computation
except ImportError:
    thop = None


class Detect(nn.Module):
    # YOLOv5 Detect head for detection models
    stride = None  # strides computed during build
    dynamic = False  # force grid reconstruction
    export = False  # export mode

    def __init__(self, nc=80, anchors=(), ch=(), inplace=True):  # detection layer
        super().__init__()
        self.nc = nc  # number of classes
        self.no = nc + 5  # number of outputs per anchor
        self.nl = len(anchors)  # number of detection layers
        self.na = len(anchors[0]) // 2  # number of anchors
        self.grid = [torch.empty(0) for _ in range(self.nl)]  # init grid
        self.anchor_grid = [torch.empty(0) for _ in range(self.nl)]  # init anchor grid
        self.register_buffer('anchors', torch.tensor(anchors).float().view(self.nl, -1, 2))  # shape(nl,na,2)
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)  # output conv
        self.inplace = inplace  # use inplace ops (e.g. slice assignment)

    def forward(self, x):
        z = []  # inference output
        logits_ = []  # modification ---1
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training:  # inference
                if self.dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)

                logits = x[i][..., 5:]  # modification ---2

                if isinstance(self, Segment):  # (boxes + masks)
                    xy, wh, conf, mask = x[i].split((2, 2, self.nc + 1, self.no - self.nc - 5), 4)
                    xy = (xy.sigmoid() * 2 + self.grid[i]) * self.stride[i]  # xy
                    wh = (wh.sigmoid() * 2) ** 2 * self.anchor_grid[i]  # wh
                    y = torch.cat((xy, wh, conf.sigmoid(), mask), 4)
                else:  # Detect (boxes only)
                    xy, wh, conf = x[i].sigmoid().split((2, 2, self.nc + 1), 4)
                    xy = (xy * 2 + self.grid[i]) * self.stride[i]  # xy
                    wh = (wh * 2) ** 2 * self.anchor_grid[i]  # wh
                    y = torch.cat((xy, wh, conf), 4)
                z.append(y.view(bs, self.na * nx * ny, self.no))
                logits_.append(logits.view(bs, -1, self.no - 5))  # modification ---3

        # return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x)
        return x if self.training else (torch.cat(z, 1), torch.cat(logits_, 1), x)  # modification ---4

    def _make_grid(self, nx=20, ny=20, i=0, torch_1_10=check_version(torch.__version__, '1.10.0')):
        d = self.anchors[i].device
        t = self.anchors[i].dtype
        shape = 1, self.na, ny, nx, 2  # grid shape
        y, x = torch.arange(ny, device=d, dtype=t), torch.arange(nx, device=d, dtype=t)
        yv, xv = torch.meshgrid(y, x, indexing='ij') if torch_1_10 else torch.meshgrid(y, x)  # torch>=0.7 compatibility
        grid = torch.stack((xv, yv), 2).expand(shape) - 0.5  # add grid offset, i.e. y = 2.0 * x - 0.5
        anchor_grid = (self.anchors[i] * self.stride[i]).view((1, self.na, 1, 1, 2)).expand(shape)
        return grid, anchor_grid


class Segment(Detect):
    # YOLOv5 Segment head for segmentation models
    def __init__(self, nc=80, anchors=(), nm=32, npr=256, ch=(), inplace=True):
        super().__init__(nc, anchors, ch, inplace)
        self.nm = nm  # number of masks
        self.npr = npr  # number of protos
        self.no = 5 + nc + self.nm  # number of outputs per anchor
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)  # output conv
        self.proto = Proto(ch[0], self.npr, self.nm)  # protos
        self.detect = Detect.forward

    def forward(self, x):
        p = self.proto(x[0])
        x = self.detect(self, x)
        return (x, p) if self.training else (x[0], p) if self.export else (x[0], p, x[1])


class BaseModel(nn.Module):
    # YOLOv5 base model
    def forward(self, x, profile=False, visualize=False):
        return self._forward_once(x, profile, visualize)  # single-scale inference, train

    def _forward_once(self, x, profile=False, visualize=False):
        y, dt = [], []  # outputs
        for m in self.model:
            if m.f != -1:  # if not from previous layer
                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
            if profile:
                self._profile_one_layer(m, x, dt)
            x = m(x)  # run
            y.append(x if m.i in self.save else None)  # save output
            if visualize:
                feature_visualization(x, m.type, m.i, save_dir=visualize)
        return x

    def _profile_one_layer(self, m, x, dt):
        c = m == self.model[-1]  # is final layer, copy input as inplace fix
        o = thop.profile(m, inputs=(x.copy() if c else x,), verbose=False)[0] / 1E9 * 2 if thop else 0  # FLOPs
        t = time_sync()
        for _ in range(10):
            m(x.copy() if c else x)
        dt.append((time_sync() - t) * 100)
        if m == self.model[0]:
            LOGGER.info(f"{'time (ms)':>10s} {'GFLOPs':>10s} {'params':>10s}  module")
        LOGGER.info(f'{dt[-1]:10.2f} {o:10.2f} {m.np:10.0f}  {m.type}')
        if c:
            LOGGER.info(f"{sum(dt):10.2f} {'-':>10s} {'-':>10s}  Total")

    def fuse(self):  # fuse model Conv2d() + BatchNorm2d() layers
        LOGGER.info('Fusing layers... ')
        for m in self.model.modules():
            if isinstance(m, (Conv, DWConv)) and hasattr(m, 'bn'):
                m.conv = fuse_conv_and_bn(m.conv, m.bn)  # update conv
                delattr(m, 'bn')  # remove batchnorm
                m.forward = m.forward_fuse  # update forward
        self.info()
        return self

    def info(self, verbose=False, img_size=640):  # print model information
        model_info(self, verbose, img_size)

    def _apply(self, fn):
        # Apply to(), cpu(), cuda(), half() to model tensors that are not parameters or registered buffers
        self = super()._apply(fn)
        m = self.model[-1]  # Detect()
        if isinstance(m, (Detect, Segment)):
            m.stride = fn(m.stride)
            m.grid = list(map(fn, m.grid))
            if isinstance(m.anchor_grid, list):
                m.anchor_grid = list(map(fn, m.anchor_grid))
        return self


class DetectionModel(BaseModel):
    # YOLOv5 detection model
    def __init__(self, cfg='yolov5s.yaml', ch=3, nc=None, anchors=None):  # model, input channels, number of classes
        super().__init__()
        if isinstance(cfg, dict):
            self.yaml = cfg  # model dict
        else:  # is *.yaml
            import yaml  # for torch hub
            self.yaml_file = Path(cfg).name
            with open(cfg, encoding='ascii', errors='ignore') as f:
                self.yaml = yaml.safe_load(f)  # model dict

        # Define model
        ch = self.yaml['ch'] = self.yaml.get('ch', ch)  # input channels
        if nc and nc != self.yaml['nc']:
            LOGGER.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
            self.yaml['nc'] = nc  # override yaml value
        if anchors:
            LOGGER.info(f'Overriding model.yaml anchors with anchors={anchors}')
            self.yaml['anchors'] = round(anchors)  # override yaml value
        self.model, self.save = parse_model(deepcopy(self.yaml), ch=[ch])  # model, savelist
        self.names = [str(i) for i in range(self.yaml['nc'])]  # default names
        self.inplace = self.yaml.get('inplace', True)

        # Build strides, anchors
        m = self.model[-1]  # Detect()
        if isinstance(m, (Detect, Segment)):
            s = 256  # 2x min stride
            m.inplace = self.inplace
            forward = lambda x: self.forward(x)[0] if isinstance(m, Segment) else self.forward(x)
            m.stride = torch.tensor([s / x.shape[-2] for x in forward(torch.zeros(1, ch, s, s))])  # forward
            check_anchor_order(m)
            m.anchors /= m.stride.view(-1, 1, 1)
            self.stride = m.stride
            self._initialize_biases()  # only run once

        # Init weights, biases
        initialize_weights(self)
        self.info()
        LOGGER.info('')

    def forward(self, x, augment=False, profile=False, visualize=False):
        if augment:
            return self._forward_augment(x)  # augmented inference, None
        return self._forward_once(x, profile, visualize)  # single-scale inference, train

    def _forward_augment(self, x):
        img_size = x.shape[-2:]  # height, width
        s = [1, 0.83, 0.67]  # scales
        f = [None, 3, None]  # flips (2-ud, 3-lr)
        y = []  # outputs
        for si, fi in zip(s, f):
            xi = scale_img(x.flip(fi) if fi else x, si, gs=int(self.stride.max()))
            yi = self._forward_once(xi)[0]  # forward
            # cv2.imwrite(f'img_{si}.jpg', 255 * xi[0].cpu().numpy().transpose((1, 2, 0))[:, :, ::-1])  # save
            yi = self._descale_pred(yi, fi, si, img_size)
            y.append(yi)
        y = self._clip_augmented(y)  # clip augmented tails
        return torch.cat(y, 1), None  # augmented inference, train

    def _descale_pred(self, p, flips, scale, img_size):
        # de-scale predictions following augmented inference (inverse operation)
        if self.inplace:
            p[..., :4] /= scale  # de-scale
            if flips == 2:
                p[..., 1] = img_size[0] - p[..., 1]  # de-flip ud
            elif flips == 3:
                p[..., 0] = img_size[1] - p[..., 0]  # de-flip lr
        else:
            x, y, wh = p[..., 0:1] / scale, p[..., 1:2] / scale, p[..., 2:4] / scale  # de-scale
            if flips == 2:
                y = img_size[0] - y  # de-flip ud
            elif flips == 3:
                x = img_size[1] - x  # de-flip lr
            p = torch.cat((x, y, wh, p[..., 4:]), -1)
        return p

    def _clip_augmented(self, y):
        # Clip YOLOv5 augmented inference tails
        nl = self.model[-1].nl  # number of detection layers (P3-P5)
        g = sum(4 ** x for x in range(nl))  # grid points
        e = 1  # exclude layer count
        i = (y[0].shape[1] // g) * sum(4 ** x for x in range(e))  # indices
        y[0] = y[0][:, :-i]  # large
        i = (y[-1].shape[1] // g) * sum(4 ** (nl - 1 - x) for x in range(e))  # indices
        y[-1] = y[-1][:, i:]  # small
        return y

    def _initialize_biases(self, cf=None):  # initialize biases into Detect(), cf is class frequency
        # https://arxiv.org/abs/1708.02002 section 3.3
        # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1.
        m = self.model[-1]  # Detect() module
        for mi, s in zip(m.m, m.stride):  # from
            b = mi.bias.view(m.na, -1)  # conv.bias(255) to (3,85)
            b.data[:, 4] += math.log(8 / (640 / s) ** 2)  # obj (8 objects per 640 image)
            b.data[:, 5:5 + m.nc] += math.log(0.6 / (m.nc - 0.99999)) if cf is None else torch.log(cf / cf.sum())  # cls
            mi.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)


Model = DetectionModel  # retain YOLOv5 'Model' class for backwards compatibility


class SegmentationModel(DetectionModel):
    # YOLOv5 segmentation model
    def __init__(self, cfg='yolov5s-seg.yaml', ch=3, nc=None, anchors=None):
        super().__init__(cfg, ch, nc, anchors)


class ClassificationModel(BaseModel):
    # YOLOv5 classification model
    def __init__(self, cfg=None, model=None, nc=1000, cutoff=10):  # yaml, model, number of classes, cutoff index
        super().__init__()
        self._from_detection_model(model, nc, cutoff) if model is not None else self._from_yaml(cfg)

    def _from_detection_model(self, model, nc=1000, cutoff=10):
        # Create a YOLOv5 classification model from a YOLOv5 detection model
        if isinstance(model, DetectMultiBackend):
            model = model.model  # unwrap DetectMultiBackend
        model.model = model.model[:cutoff]  # backbone
        m = model.model[-1]  # last layer
        ch = m.conv.in_channels if hasattr(m, 'conv') else m.cv1.conv.in_channels  # ch into module
        c = Classify(ch, nc)  # Classify()
        c.i, c.f, c.type = m.i, m.f, 'models.common.Classify'  # index, from, type
        model.model[-1] = c  # replace
        self.model = model.model
        self.stride = model.stride
        self.save = []
        self.nc = nc

    def _from_yaml(self, cfg):
        # Create a YOLOv5 classification model from a *.yaml file
        self.model = None


def parse_model(d, ch):  # model_dict, input_channels(3)
    # Parse a YOLOv5 model.yaml dictionary
    LOGGER.info(f"\n{'':>3}{'from':>18}{'n':>3}{'params':>10}  {'module':<40}{'arguments':<30}")
    anchors, nc, gd, gw, act = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple'], d.get('activation')
    if act:
        Conv.default_act = eval(act)  # redefine default activation, i.e. Conv.default_act = nn.SiLU()
        LOGGER.info(f"{colorstr('activation:')} {act}")  # print
    na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors  # number of anchors
    no = na * (nc + 5)  # number of outputs = anchors * (classes + 5)

    layers, save, c2 = [], [], ch[-1]  # layers, savelist, ch out
    for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']):  # from, number, module, args
        m = eval(m) if isinstance(m, str) else m  # eval strings
        for j, a in enumerate(args):
            with contextlib.suppress(NameError):
                args[j] = eval(a) if isinstance(a, str) else a  # eval strings

        n = n_ = max(round(n * gd), 1) if n > 1 else n  # depth gain
        if m in {Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, DWConv, MixConv2d, Focus, CrossConv,
                 BottleneckCSP, C3, C3TR, C3SPP, C3Ghost, nn.ConvTranspose2d, DWConvTranspose2d, C3x}:
            c1, c2 = ch[f], args[0]
            if c2 != no:  # if not output
                c2 = make_divisible(c2 * gw, 8)

            args = [c1, c2, *args[1:]]
            if m in {BottleneckCSP, C3, C3TR, C3Ghost, C3x}:
                args.insert(2, n)  # number of repeats
                n = 1
        elif m is nn.BatchNorm2d:
            args = [ch[f]]
        elif m is Concat:
            c2 = sum(ch[x] for x in f)
        # TODO: channel, gw, gd
        elif m in {Detect, Segment}:
            args.append([ch[x] for x in f])
            if isinstance(args[1], int):  # number of anchors
                args[1] = [list(range(args[1] * 2))] * len(f)
            if m is Segment:
                args[3] = make_divisible(args[3] * gw, 8)
        elif m is Contract:
            c2 = ch[f] * args[0] ** 2
        elif m is Expand:
            c2 = ch[f] // args[0] ** 2
        else:
            c2 = ch[f]

        m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args)  # module
        t = str(m)[8:-2].replace('__main__.', '')  # module type
        np = sum(x.numel() for x in m_.parameters())  # number params
        m_.i, m_.f, m_.type, m_.np = i, f, t, np  # attach index, 'from' index, type, number params
        LOGGER.info(f'{i:>3}{str(f):>18}{n_:>3}{np:10.0f}  {t:<40}{str(args):<30}')  # print
        save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
        layers.append(m_)
        if i == 0:
            ch = []
        ch.append(c2)
    return nn.Sequential(*layers), sorted(save)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg', type=str, default='yolov5s.yaml', help='model.yaml')
    parser.add_argument('--batch-size', type=int, default=1, help='total batch size for all GPUs')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--profile', action='store_true', help='profile model speed')
    parser.add_argument('--line-profile', action='store_true', help='profile model speed layer by layer')
    parser.add_argument('--test', action='store_true', help='test all yolo*.yaml')
    opt = parser.parse_args()
    opt.cfg = check_yaml(opt.cfg)  # check YAML
    print_args(vars(opt))
    device = select_device(opt.device)

    # Create model
    im = torch.rand(opt.batch_size, 3, 640, 640).to(device)
    model = Model(opt.cfg).to(device)

    # Options
    if opt.line_profile:  # profile layer by layer
        model(im, profile=True)

    elif opt.profile:  # profile forward-backward
        results = profile(input=im, ops=[model], n=3)

    elif opt.test:  # test all models
        for cfg in Path(ROOT / 'models').rglob('yolo*.yaml'):
            try:
                _ = Model(cfg)
            except Exception as e:
                print(f'Error in {cfg}: {e}')

    else:  # report fused model summary
        model.fuse()

Step 6:

Run main_gradcam.py, for example: python main_gradcam.py --model-path yolov5s.pt --img-path data/images/bus.jpg --method gradcam
The arguments can be adjusted as needed; the heatmaps are written under the output directory, in a subfolder per image and per method, with one file per detection layer and per detected object (e.g. 17_0.jpg).

# Arguments
parser = argparse.ArgumentParser()
parser.add_argument('--model-path', type=str, default="yolov5s.pt", help='Path to the model')
parser.add_argument('--img-path', type=str, default='data/images/bus.jpg', help='input image path')
parser.add_argument('--output-dir', type=str, default='runs/result17', help='output dir')
parser.add_argument('--img-size', type=int, default=640, help="input image size")
parser.add_argument('--target-layer', type=str, default='model_17_cv3_act',
                    help='The layer hierarchical address to which Grad-CAM will be applied;'
                         ' the names are separated by underscores')
parser.add_argument('--method', type=str, default='gradcam', help='gradcam or gradcampp')
parser.add_argument('--device', type=str, default='cuda', help='cuda or cpu')
parser.add_argument('--no_text_box', action='store_true',
                    help='do not show label and box on the heatmap')
args = parser.parse_args()

Done.