Skip to Content
AlgorithmCvYOLO 系列小目标检测优化完全指南

YOLO 系列小目标检测优化完全指南

小目标检测一直是目标检测领域的难点,本文系统整理了 YOLO 系列(v3-v10)针对小目标的优化策略,从模型结构、训练技巧到工程实践,提供完整的解决方案。

小目标检测为什么难?

技术挑战

  1. 特征表达不足:经过多次下采样后,小目标的特征几乎消失
  2. 感受野不匹配:深层网络的大感受野不适合小目标
  3. 正负样本不均衡:小目标的正样本少,训练困难
  4. 定位精度要求高:几个像素的偏差对小目标影响巨大

实际场景挑战

  • 医疗影像:细胞、细菌等微小病灶
  • 无人机航拍:高空视角下的行人、车辆
  • 卫星遥感:建筑物、船只等目标
  • 工业质检:PCB 板上的微小缺陷

一、模型结构改进

1. 多尺度特征融合

FPN 到 PANet 的演进

# YOLOv3 的 FPN 结构 class FPN(nn.Module): def __init__(self, in_channels_list, out_channels): super().__init__() self.lateral_convs = nn.ModuleList() self.fpn_convs = nn.ModuleList() for in_channels in in_channels_list: self.lateral_convs.append( nn.Conv2d(in_channels, out_channels, 1) ) self.fpn_convs.append( nn.Conv2d(out_channels, out_channels, 3, padding=1) ) def forward(self, features): # features: [P3, P4, P5] from backbone laterals = [conv(f) for conv, f in zip(self.lateral_convs, features)] # 自顶向下融合 for i in range(len(laterals) - 1, 0, -1): laterals[i - 1] += F.interpolate( laterals[i], scale_factor=2, mode='nearest' ) # 输出特征 outs = [conv(lat) for conv, lat in zip(self.fpn_convs, laterals)] return outs
# YOLOv4/v5 的 PANet 结构 class PANet(nn.Module): def __init__(self): super().__init__() # 自顶向下路径(FPN) self.fpn_p5_to_p4 = nn.Conv2d(512, 256, 1) self.fpn_p4_to_p3 = nn.Conv2d(256, 128, 1) # 自底向上路径(PAN) self.pan_p3_to_p4 = nn.Conv2d(128, 256, 3, stride=2, padding=1) self.pan_p4_to_p5 = nn.Conv2d(256, 512, 3, stride=2, padding=1) def forward(self, features): p3, p4, p5 = features # FPN 自顶向下 p5_up = F.interpolate(self.fpn_p5_to_p4(p5), scale_factor=2) p4 = p4 + p5_up p4_up = F.interpolate(self.fpn_p4_to_p3(p4), scale_factor=2) p3 = p3 + p4_up # PAN 自底向上 p3_down = self.pan_p3_to_p4(p3) p4 = p4 + p3_down p4_down = self.pan_p4_to_p5(p4) p5 = p5 + p4_down return [p3, p4, p5] # 多尺度输出

增加更浅层特征(P2 检测层)

# 添加 P2 检测层配置 class YOLOv5_P2(nn.Module): def __init__(self): super().__init__() # 原有的 P3, P4, P5 self.detect_p3 = DetectHead(128, num_classes=80) # stride=8 self.detect_p4 = DetectHead(256, num_classes=80) # stride=16 self.detect_p5 = DetectHead(512, num_classes=80) # stride=32 # 新增 P2 检测层 self.detect_p2 = DetectHead(64, num_classes=80) # stride=4 # P2 特征提取 self.p2_conv = nn.Sequential( nn.Conv2d(64, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU(inplace=True) ) def forward(self, x): # x 包含来自 backbone 的 [P2, P3, P4, P5] 特征 p2, p3, p4, p5 = x # P2 特征增强 p2 = self.p2_conv(p2) # 多尺度检测 outputs = [] outputs.append(self.detect_p2(p2)) # 160×160 用于极小目标 outputs.append(self.detect_p3(p3)) # 80×80 用于小目标 outputs.append(self.detect_p4(p4)) # 40×40 用于中目标 outputs.append(self.detect_p5(p5)) # 20×20 用于大目标 return outputs

2. 更高分辨率输入

# 动态调整输入分辨率 class MultiScaleTraining: def __init__(self, base_size=640, scales=[0.5, 0.75, 1.0, 1.25, 1.5]): self.base_size = base_size self.scales = scales def get_size(self, epoch): # 渐进式提高分辨率 if epoch < 50: return 640 elif epoch < 100: return 896 elif epoch < 150: return 1024 else: # 随机多尺度 scale = random.choice(self.scales) return int(self.base_size * scale) # 使用示例 def train_with_dynamic_resolution(model, dataloader, epochs): scale_scheduler = MultiScaleTraining() for epoch in range(epochs): input_size = scale_scheduler.get_size(epoch) # 调整数据加载器的输入尺寸 dataloader.dataset.img_size = input_size for images, targets in dataloader: # 根据新尺寸调整图像 images = F.interpolate(images, size=(input_size, input_size)) # 训练步骤 outputs = model(images) loss = compute_loss(outputs, targets) loss.backward()

3. Anchor 优化(YOLOv3-v7)

import numpy as np from sklearn.cluster import KMeans def optimize_anchors_for_small_objects(dataset, num_anchors=9, img_size=640): """ 针对小目标数据集优化 anchor """ # 收集所有边界框 boxes = [] for _, targets in dataset: for target in targets: w, h = target[3] * img_size, target[4] * img_size boxes.append([w, h]) boxes = np.array(boxes) # 过滤小目标(例如面积 < 32×32) small_boxes = boxes[boxes[:, 0] * boxes[:, 1] < 32 * 32] # K-means 聚类 kmeans = KMeans(n_clusters=num_anchors, random_state=0) kmeans.fit(small_boxes) # 获取聚类中心作为 anchor anchors = kmeans.cluster_centers_ # 按面积排序 areas = anchors[:, 0] * anchors[:, 1] sorted_idx = np.argsort(areas) anchors = anchors[sorted_idx] print("优化后的小目标 anchors:") for i, (w, h) in enumerate(anchors): print(f" Anchor {i+1}: {w:.1f} x {h:.1f}") return anchors # 使用 IoU 距离的改进版本 def kmeans_iou(boxes, num_anchors): """ 使用 IoU 作为距离度量的 K-means """ def iou_distance(box, anchor): # 计算 IoU(假设都从原点开始) intersection = min(box[0], anchor[0]) * min(box[1], anchor[1]) union = box[0] * box[1] + anchor[0] * anchor[1] - intersection return 1 - intersection / union # 初始化 anchors anchors = boxes[np.random.choice(len(boxes), num_anchors, replace=False)] for _ in range(100): # 迭代次数 # 分配每个 box 到最近的 anchor distances = np.array([ [iou_distance(box, anchor) for anchor in anchors] for box in boxes ]) assignments = np.argmin(distances, axis=1) # 更新 anchors new_anchors = [] for i in range(num_anchors): cluster_boxes = boxes[assignments == i] if len(cluster_boxes) > 0: # 使用中位数而非平均值,更鲁棒 new_anchors.append(np.median(cluster_boxes, axis=0)) else: new_anchors.append(anchors[i]) anchors = np.array(new_anchors) return anchors

4. 改进的检测头

# Decoupled Head(解耦头)- YOLOv6/v8 采用 class DecoupledHead(nn.Module): def __init__(self, in_channels, num_classes): super().__init__() # 分类分支 self.cls_conv = nn.Sequential( nn.Conv2d(in_channels, in_channels, 3, padding=1), nn.BatchNorm2d(in_channels), nn.ReLU(inplace=True), nn.Conv2d(in_channels, in_channels, 3, padding=1), nn.BatchNorm2d(in_channels), nn.ReLU(inplace=True) ) self.cls_pred = nn.Conv2d(in_channels, num_classes, 1) # 回归分支 self.reg_conv = nn.Sequential( nn.Conv2d(in_channels, in_channels, 3, padding=1), nn.BatchNorm2d(in_channels), nn.ReLU(inplace=True), nn.Conv2d(in_channels, in_channels, 3, padding=1), nn.BatchNorm2d(in_channels), nn.ReLU(inplace=True) ) self.reg_pred = nn.Conv2d(in_channels, 4, 1) # 边界框回归 # IoU 分支(可选) self.iou_pred = nn.Conv2d(in_channels, 1, 1) # IoU 预测 def forward(self, x): # 分类 cls_feat = self.cls_conv(x) cls_output = self.cls_pred(cls_feat) # 回归 reg_feat = self.reg_conv(x) reg_output = self.reg_pred(reg_feat) iou_output = self.iou_pred(reg_feat) return cls_output, reg_output, iou_output # RepPoints 风格的可变形检测 class DeformableHead(nn.Module): def __init__(self, in_channels, num_classes): super().__init__() self.offset_conv = nn.Conv2d(in_channels, 18, 3, padding=1) # 9 个采样点的偏移 self.deform_conv = DeformConv2d(in_channels, in_channels, 3, padding=1) self.cls_conv = nn.Conv2d(in_channels, num_classes, 1) self.reg_conv = nn.Conv2d(in_channels, 4, 1) def forward(self, x): # 学习采样点偏移 offsets = self.offset_conv(x) # 可变形卷积 x = self.deform_conv(x, offsets) # 预测 cls = self.cls_conv(x) reg = self.reg_conv(x) return cls, reg

二、训练技巧

1. 数据增强策略

Mosaic 数据增强

class MosaicAugmentation: def __init__(self, img_size=640, prob=1.0): self.img_size = img_size self.prob = prob def __call__(self, images, targets): if random.random() > self.prob: return images[0], targets[0] # 随机选择 4 张图片 if len(images) < 4: images = images * 4 targets = targets * 4 indices = random.sample(range(len(images)), 4) imgs = [images[i] for i in indices] labels = [targets[i] for i in indices] # 创建 2x2 拼接 h, w = self.img_size // 2, self.img_size // 2 # 初始化大图 mosaic_img = np.zeros((self.img_size, self.img_size, 3), dtype=np.float32) mosaic_labels = [] # 随机中心点 cx = random.randint(w // 2, w * 3 // 2) cy = random.randint(h // 2, h * 3 // 2) for i, (img, label) in enumerate(zip(imgs, labels)): # 调整图片大小 img = cv2.resize(img, (w, h)) # 确定放置位置 if i == 0: # 左上 x1, y1, x2, y2 = 0, 0, cx, cy sx1, sy1, sx2, sy2 = w - cx, h - cy, w, h elif i == 1: # 右上 x1, y1, x2, y2 = cx, 0, self.img_size, cy sx1, sy1, sx2, sy2 = 0, h - cy, w - cx, h elif i == 2: # 左下 x1, y1, x2, y2 = 0, cy, cx, self.img_size sx1, sy1, sx2, sy2 = w - cx, 0, w, h - cy else: # 右下 x1, y1, x2, y2 = cx, cy, self.img_size, self.img_size sx1, sy1, sx2, sy2 = 0, 0, w - cx, h - cy # 放置图片片段 mosaic_img[y1:y2, x1:x2] = img[sy1:sy2, sx1:sx2] # 调整标签 for cls, x, y, w, h in label: # 转换到 mosaic 坐标 x = (x * (sx2 - sx1) + x1) / self.img_size y = (y * (sy2 - sy1) + y1) / self.img_size w = w * (sx2 - sx1) / self.img_size h = h * (sy2 - sy1) / self.img_size # 裁剪到图像边界 x = np.clip(x, 0, 1) y = np.clip(y, 0, 1) w = min(w, 1 - x) h = min(h, 1 - y) if w > 0 and h > 0: mosaic_labels.append([cls, x, y, w, h]) return mosaic_img, np.array(mosaic_labels)

Copy-Paste 增强

class CopyPasteAugmentation: def __init__(self, prob=0.5, max_objects=30): self.prob = prob self.max_objects = max_objects def __call__(self, image, labels, paste_objects): """ paste_objects: 预先裁剪好的小目标库 """ if random.random() > self.prob: return image, labels h, w = image.shape[:2] new_labels = labels.copy() # 随机选择要粘贴的小目标 n_paste = random.randint(1, min(self.max_objects, len(paste_objects))) selected = random.sample(paste_objects, n_paste) for obj in selected: obj_img, obj_mask, obj_label = obj oh, ow = obj_img.shape[:2] # 随机位置(避免边界) max_x = w - ow max_y = h - oh if max_x <= 0 or max_y <= 0: continue x = random.randint(0, max_x) y = random.randint(0, max_y) # 检查重叠(可选) overlap = self.check_overlap(new_labels, x, y, ow, oh) if overlap > 0.3: # 重叠阈值 continue # 粘贴对象 image[y:y+oh, x:x+ow] = \ image[y:y+oh, x:x+ow] * (1 - obj_mask) + obj_img * obj_mask # 添加标签 cx = (x + ow/2) / w cy = (y + oh/2) / h bw = ow / w bh = oh / h new_labels = np.vstack([new_labels, [obj_label, cx, cy, bw, bh]]) return image, new_labels def check_overlap(self, labels, x, y, w, h): # 检查与现有目标的重叠度 # 实现略... return 0.0

2. 小目标过采样

from torch.utils.data import WeightedRandomSampler def create_weighted_sampler(dataset, size_threshold=32*32): """ 为包含小目标的图像增加采样权重 """ weights = [] for idx in range(len(dataset)): _, targets = dataset[idx] # 计算小目标数量 small_obj_count = 0 for target in targets: # 假设 target 格式为 [class, x, y, w, h] area = target[3] * target[4] * dataset.img_size ** 2 if area < size_threshold: small_obj_count += 1 # 权重与小目标数量成正比 weight = 1.0 + small_obj_count * 0.5 # 可调整系数 weights.append(weight) # 创建采样器 sampler = WeightedRandomSampler( weights=weights, num_samples=len(dataset), replacement=True ) return sampler # 使用示例 train_sampler = create_weighted_sampler(train_dataset) train_loader = DataLoader( train_dataset, batch_size=16, sampler=train_sampler, num_workers=4 )

3. 损失函数优化

# IoU 系列损失函数 class IoULoss(nn.Module): def __init__(self, loss_type="ciou"): super().__init__() self.loss_type = loss_type def forward(self, pred_boxes, target_boxes): """ pred_boxes, target_boxes: [N, 4] (x1, y1, x2, y2) """ # 计算交集 inter_x1 = torch.max(pred_boxes[:, 0], target_boxes[:, 0]) inter_y1 = torch.max(pred_boxes[:, 1], target_boxes[:, 1]) inter_x2 = torch.min(pred_boxes[:, 2], target_boxes[:, 2]) inter_y2 = torch.min(pred_boxes[:, 3], target_boxes[:, 3]) inter_area = torch.clamp(inter_x2 - inter_x1, min=0) * \ torch.clamp(inter_y2 - inter_y1, min=0) # 计算并集 pred_area = (pred_boxes[:, 2] - pred_boxes[:, 0]) * \ (pred_boxes[:, 3] - pred_boxes[:, 1]) target_area = (target_boxes[:, 2] - target_boxes[:, 0]) * \ (target_boxes[:, 3] - target_boxes[:, 1]) union_area = pred_area + target_area - inter_area # 基础 IoU iou = inter_area / (union_area + 1e-7) if self.loss_type == "iou": loss = 1 - iou elif self.loss_type == "giou": # 计算最小外接框 enclose_x1 = torch.min(pred_boxes[:, 0], target_boxes[:, 0]) enclose_y1 = torch.min(pred_boxes[:, 1], target_boxes[:, 1]) enclose_x2 = torch.max(pred_boxes[:, 2], target_boxes[:, 2]) enclose_y2 = torch.max(pred_boxes[:, 3], target_boxes[:, 3]) enclose_area = (enclose_x2 - enclose_x1) * (enclose_y2 - enclose_y1) giou = iou - (enclose_area - union_area) / enclose_area loss = 1 - giou elif self.loss_type == "diou": # 中心点距离 pred_cx = (pred_boxes[:, 0] + pred_boxes[:, 2]) / 2 pred_cy = (pred_boxes[:, 1] + pred_boxes[:, 3]) / 2 target_cx = (target_boxes[:, 0] + target_boxes[:, 2]) / 2 target_cy = (target_boxes[:, 1] + target_boxes[:, 3]) / 2 center_dist = (pred_cx - target_cx) ** 2 + (pred_cy - target_cy) ** 2 # 对角线距离 enclose_x1 = torch.min(pred_boxes[:, 0], target_boxes[:, 0]) enclose_y1 = torch.min(pred_boxes[:, 1], target_boxes[:, 1]) enclose_x2 = torch.max(pred_boxes[:, 2], target_boxes[:, 2]) enclose_y2 = torch.max(pred_boxes[:, 3], target_boxes[:, 3]) diagonal_dist = (enclose_x2 - enclose_x1) ** 2 + \ (enclose_y2 - enclose_y1) ** 2 diou = iou - center_dist / (diagonal_dist + 1e-7) loss = 1 - diou elif self.loss_type == "ciou": # 在 DIoU 基础上添加长宽比惩罚 # 实现略... pass return loss.mean() # Focal Loss 处理类别不均衡 class FocalLoss(nn.Module): def __init__(self, alpha=0.25, gamma=2.0): super().__init__() self.alpha = alpha self.gamma = gamma def forward(self, pred, target): """ pred: [N, num_classes] 预测的类别概率 target: [N] 真实类别 """ ce_loss = F.cross_entropy(pred, target, reduction='none') p_t = torch.exp(-ce_loss) # Focal 权重 focal_weight = (1 - p_t) ** self.gamma # Alpha 权重(处理正负样本不均衡) alpha_t = self.alpha * target + (1 - self.alpha) * (1 - target) loss = alpha_t * focal_weight * ce_loss return loss.mean()

三、推理与后处理

1. Test-Time Augmentation (TTA)

class TTA: def __init__(self, model, scales=[0.8, 1.0, 1.2], flips=[False, True]): self.model = model self.scales = scales self.flips = flips def __call__(self, image): predictions = [] for scale in self.scales: for flip in self.flips: # 缩放 scaled_img = F.interpolate( image, scale_factor=scale, mode='bilinear' ) # 翻转 if flip: scaled_img = torch.flip(scaled_img, dims=[-1]) # 预测 with torch.no_grad(): pred = self.model(scaled_img) # 还原预测框 if flip: pred[..., 0] = 1 - pred[..., 0] # x 坐标翻转 pred[..., :4] /= scale # 缩放还原 predictions.append(pred) # 合并所有预测 return self.merge_predictions(predictions) def merge_predictions(self, predictions): """ 使用 NMS 或投票机制合并多个预测 """ all_boxes = [] all_scores = [] all_classes = [] for pred in predictions: boxes, scores, classes = self.decode_prediction(pred) all_boxes.append(boxes) all_scores.append(scores) all_classes.append(classes) # 合并所有预测 all_boxes = torch.cat(all_boxes, dim=0) all_scores = torch.cat(all_scores, dim=0) all_classes = torch.cat(all_classes, dim=0) # NMS keep = self.nms(all_boxes, all_scores, iou_threshold=0.5) return all_boxes[keep], all_scores[keep], all_classes[keep]

2. 改进的 NMS

def soft_nms(boxes, scores, iou_threshold=0.5, sigma=0.5, score_threshold=0.001): """ Soft-NMS: 不完全抑制重叠框,而是降低其分数 """ N = boxes.shape[0] indexes = torch.arange(N) # 按分数排序 scores, order = scores.sort(descending=True) boxes = boxes[order] keep = [] while order.numel() > 0: if order.numel() == 1: keep.append(order[0]) break # 保留最高分的框 i = order[0] keep.append(i) # 计算 IoU xx1 = torch.max(boxes[i, 0], boxes[order[1:], 0]) yy1 = torch.max(boxes[i, 1], boxes[order[1:], 1]) xx2 = torch.min(boxes[i, 2], boxes[order[1:], 2]) yy2 = torch.min(boxes[i, 3], boxes[order[1:], 3]) w = torch.clamp(xx2 - xx1, min=0) h = torch.clamp(yy2 - yy1, min=0) inter = w * h areas_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1]) areas = (boxes[order[1:], 2] - boxes[order[1:], 0]) * \ (boxes[order[1:], 3] - boxes[order[1:], 1]) iou = inter / (areas_i + areas - inter) # Soft-NMS: 根据 IoU 降低分数 scores[order[1:]] *= torch.exp(-(iou ** 2) / sigma) # 保留分数高于阈值的框 idx = torch.where(scores[order[1:]] > score_threshold)[0] order = order[idx + 1] return keep def diou_nms(boxes, scores, iou_threshold=0.5): """ DIoU-NMS: 考虑中心点距离的 NMS """ keep = [] order = scores.argsort(descending=True) while order.numel() > 0: i = order[0] keep.append(i) if order.numel() == 1: break # 计算 IoU iou = box_iou(boxes[i].unsqueeze(0), boxes[order[1:]])[0] # 计算中心点距离 center_i = (boxes[i, :2] + boxes[i, 2:]) / 2 centers = (boxes[order[1:], :2] + boxes[order[1:], 2:]) / 2 center_distance = ((center_i - centers) ** 2).sum(dim=1).sqrt() # 计算对角线距离 enclose_mins = torch.min(boxes[i, :2], boxes[order[1:], :2]) enclose_maxs = torch.max(boxes[i, 2:], boxes[order[1:], 2:]) diagonal = ((enclose_maxs - enclose_mins) ** 2).sum(dim=1).sqrt() # DIoU diou = iou - (center_distance / diagonal) ** 2 # 保留 DIoU 小于阈值的框 idx = torch.where(diou < iou_threshold)[0] order = order[idx + 1] return keep

四、工程实战技巧

1. 数据标注质量控制

class LabelQualityChecker: def __init__(self, min_size=2, max_aspect_ratio=10): self.min_size = min_size self.max_aspect_ratio = max_aspect_ratio def check_dataset(self, dataset): """ 检查数据集标注质量 """ issues = [] for idx, (image, labels) in enumerate(dataset): h, w = image.shape[:2] for label_idx, label in enumerate(labels): cls, x, y, bw, bh = label # 检查边界框尺寸 pixel_w = bw * w pixel_h = bh * h if pixel_w < self.min_size or pixel_h < self.min_size: issues.append({ 'type': 'too_small', 'image_idx': idx, 'label_idx': label_idx, 'size': (pixel_w, pixel_h) }) # 检查长宽比 aspect_ratio = max(pixel_w / pixel_h, pixel_h / pixel_w) if aspect_ratio > self.max_aspect_ratio: issues.append({ 'type': 'extreme_aspect_ratio', 'image_idx': idx, 'label_idx': label_idx, 'aspect_ratio': aspect_ratio }) # 检查边界 if x - bw/2 < 0 or x + bw/2 > 1 or \ y - bh/2 < 0 or y + bh/2 > 1: issues.append({ 'type': 'out_of_bounds', 'image_idx': idx, 'label_idx': label_idx }) return issues # 半自动标注辅助 class SemiAutoLabeling: def __init__(self, model, confidence_threshold=0.7): self.model = model self.confidence_threshold = confidence_threshold def generate_pseudo_labels(self, unlabeled_images): """ 使用训练好的模型生成伪标签 """ pseudo_labels = [] for image in unlabeled_images: with torch.no_grad(): predictions = self.model(image) # 筛选高置信度预测 boxes, scores, classes = self.decode_predictions(predictions) high_conf_mask = scores > self.confidence_threshold boxes = boxes[high_conf_mask] scores = scores[high_conf_mask] classes = classes[high_conf_mask] # 人工审核标记 labels = [] for box, score, cls in zip(boxes, scores, classes): labels.append({ 'box': box, 'class': cls, 'score': score, 'need_review': score < 0.9 # 标记需要审核的 }) pseudo_labels.append(labels) return pseudo_labels

2. 混合模型策略

class SuperResolutionDetector: def __init__(self, sr_model, detection_model, sr_scale=2): self.sr_model = sr_model # 超分辨率模型 self.detection_model = detection_model self.sr_scale = sr_scale def detect(self, image, use_sr_threshold=32): """ 对小目标区域使用超分辨率 """ # 第一次检测,获取初步结果 with torch.no_grad(): initial_detections = self.detection_model(image) boxes, scores, classes = self.decode_detections(initial_detections) # 识别小目标区域 small_regions = [] h, w = image.shape[-2:] for box in boxes: box_w = (box[2] - box[0]) * w box_h = (box[3] - box[1]) * h if box_w < use_sr_threshold or box_h < use_sr_threshold: # 扩展区域(添加 padding) x1 = max(0, int(box[0] * w - 10)) y1 = max(0, int(box[1] * h - 10)) x2 = min(w, int(box[2] * w + 10)) y2 = min(h, int(box[3] * h + 10)) small_regions.append((x1, y1, x2, y2)) # 对小目标区域进行超分辨率 enhanced_detections = [] for x1, y1, x2, y2 in small_regions: # 裁剪区域 region = image[:, :, y1:y2, x1:x2] # 超分辨率 with torch.no_grad(): sr_region = self.sr_model(region) # 在增强的区域上重新检测 with torch.no_grad(): region_detections = self.detection_model(sr_region) # 将检测结果映射回原图坐标 region_boxes, region_scores, region_classes = \ self.decode_detections(region_detections) # 坐标转换 scale = 1 / self.sr_scale for box in region_boxes: box[0] = (box[0] * (x2 - x1) * scale + x1) / w box[1] = (box[1] * (y2 - y1) * scale + y1) / h box[2] = (box[2] * (x2 - x1) * scale + x1) / w box[3] = (box[3] * (y2 - y1) * scale + y1) / h enhanced_detections.append((region_boxes, region_scores, region_classes)) # 合并所有检测结果 return self.merge_detections(initial_detections, enhanced_detections)

3. 轻量化分支设计

class LightweightSmallObjectBranch(nn.Module): def __init__(self, in_channels=64): super().__init__() # 轻量级卷积(深度可分离) self.dw_conv = nn.Conv2d( in_channels, in_channels, 3, padding=1, groups=in_channels ) self.pw_conv = nn.Conv2d(in_channels, in_channels // 2, 1) # 注意力模块 self.attention = nn.Sequential( nn.AdaptiveAvgPool2d(1), nn.Conv2d(in_channels // 2, in_channels // 8, 1), nn.ReLU(inplace=True), nn.Conv2d(in_channels // 8, in_channels // 2, 1), nn.Sigmoid() ) # 检测头 self.detect = nn.Conv2d(in_channels // 2, 5, 1) # 4 bbox + 1 obj def forward(self, x): # 深度可分离卷积 x = self.dw_conv(x) x = self.pw_conv(x) # 通道注意力 att = self.attention(x) x = x * att # 检测输出 return self.detect(x) # 整合到主模型 class YOLOWithSmallObjectBranch(nn.Module): def __init__(self, base_model): super().__init__() self.base_model = base_model # 在浅层特征上添加小目标分支 self.small_obj_branch = LightweightSmallObjectBranch(64) def forward(self, x): # 提取多尺度特征 features = self.base_model.extract_features(x) # 浅层特征用于小目标检测 small_obj_pred = self.small_obj_branch(features[0]) # 最浅层 # 常规检测 regular_pred = self.base_model.detect(features) return { 'small': small_obj_pred, 'regular': regular_pred }

五、各版本 YOLO 的特定优化

YOLOv3-v5 特定配置

# yolov5s-small-objects.yaml nc: 80 # 类别数 depth_multiple: 0.33 width_multiple: 0.50 # anchors - 针对小目标优化 anchors: - [4,5, 6,8, 9,12] # P2/4 - 新增,极小目标 - [10,13, 16,30, 33,23] # P3/8 - 小目标 - [30,61, 62,45, 59,119] # P4/16 - 中目标 - [116,90, 156,198, 373,326] # P5/32 - 大目标 # backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 - 保留用于检测 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]]] # 9 # head - 包含 P2 检测层 head: [[-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 13 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 17 (P3/8-small) [-1, 1, Conv, [128, 1, 1]], # 新增 P2 上采样 [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 2], 1, Concat, [1]], # cat backbone P2 [-1, 3, C3, [128, False]], # 21 (P2/4-xsmall) [-1, 1, Conv, [128, 3, 2]], [[-1, 18], 1, Concat, [1]], # cat head P3 [-1, 3, C3, [256, False]], # 24 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 14], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 27 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 10], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [1024, False]], # 30 (P5/32-large) [[21, 24, 27, 30], 1, Detect, [nc, anchors]]] # 检测层包含 P2

YOLOv8 特定优化

# YOLOv8 小目标优化配置 def create_yolov8_small_object_config(): return { # 模型配置 'model': { 'nc': 80, 'scales': { 'n': [0.33, 0.25], # depth, width 's': [0.33, 0.50], 'm': [0.67, 0.75], 'l': [1.00, 1.00], 'x': [1.00, 1.25], } }, # 训练配置 'train': { 'imgsz': 1280, # 更高分辨率 'batch': 16, 'epochs': 300, 'patience': 50, 'optimizer': 'AdamW', 'lr0': 0.001, 'lrf': 0.01, 'momentum': 0.937, 'weight_decay': 0.0005, 'warmup_epochs': 3.0, 'warmup_momentum': 0.8, 'warmup_bias_lr': 0.1, # 损失权重 - 提高边界框损失权重 'box': 7.5, # 默认 7.5 'cls': 0.5, # 默认 0.5 'dfl': 1.5, # 默认 1.5 # 数据增强 'hsv_h': 0.015, 'hsv_s': 0.7, 'hsv_v': 0.4, 'degrees': 0.0, # 小目标避免过度旋转 'translate': 0.1, 'scale': 0.5, 'shear': 0.0, 'perspective': 0.0, 'flipud': 0.0, 'fliplr': 0.5, 'mosaic': 1.0, 'mixup': 0.1, 'copy_paste': 0.3, # Copy-Paste 增强 }, # 验证配置 'val': { 'imgsz': 1280, 'conf': 0.001, # 降低置信度阈值 'iou': 0.6, 'max_det': 300, } }

六、性能评估与调试

1. 小目标专用评估指标

def evaluate_small_objects(predictions, ground_truths, size_ranges): """ 分尺寸评估检测性能 size_ranges: [(0, 32), (32, 96), (96, float('inf'))] """ results = {} for size_min, size_max in size_ranges: # 筛选该尺寸范围的真实框 size_gts = [] for gt in ground_truths: area = gt['width'] * gt['height'] if size_min <= area < size_max: size_gts.append(gt) # 计算该尺寸范围的 AP if len(size_gts) > 0: ap = calculate_ap(predictions, size_gts) results[f'{size_min}-{size_max}'] = ap else: results[f'{size_min}-{size_max}'] = 0.0 return results # COCO 风格的评估 class COCOEvaluator: def __init__(self): self.area_ranges = [ (0, 32**2), # small (32**2, 96**2), # medium (96**2, 1e5**2) # large ] def evaluate(self, dataset, model): # 收集所有预测和真实值 all_predictions = [] all_ground_truths = [] for image, gt in dataset: pred = model(image) all_predictions.extend(pred) all_ground_truths.extend(gt) # 计算各项指标 metrics = { 'mAP': self.calculate_map(all_predictions, all_ground_truths), 'mAP_50': self.calculate_map(all_predictions, all_ground_truths, iou_thresh=0.5), 'mAP_75': self.calculate_map(all_predictions, all_ground_truths, iou_thresh=0.75), } # 分尺寸评估 for i, (min_area, max_area) in enumerate(self.area_ranges): area_name = ['small', 'medium', 'large'][i] metrics[f'mAP_{area_name}'] = self.calculate_map_by_area( all_predictions, all_ground_truths, min_area, max_area ) return metrics

2. 可视化调试工具

import cv2 import matplotlib.pyplot as plt def visualize_small_object_detections(image, predictions, ground_truths, size_threshold=32): """ 可视化小目标检测结果 """ fig, axes = plt.subplots(1, 3, figsize=(15, 5)) # 原图 + 所有检测 ax1 = axes[0] ax1.imshow(image) ax1.set_title('All Detections') for pred in predictions: rect = plt.Rectangle( (pred['x'], pred['y']), pred['w'], pred['h'], fill=False, color='red', linewidth=1 ) ax1.add_patch(rect) # 小目标检测 ax2 = axes[1] ax2.imshow(image) ax2.set_title(f'Small Objects (<{size_threshold}px)') for pred in predictions: if pred['w'] < size_threshold or pred['h'] < size_threshold: rect = plt.Rectangle( (pred['x'], pred['y']), pred['w'], pred['h'], fill=False, color='yellow', linewidth=2 ) ax2.add_patch(rect) # 漏检分析 ax3 = axes[2] ax3.imshow(image) ax3.set_title('Missed Detections') for gt in ground_truths: matched = False for pred in predictions: iou = calculate_iou(gt, pred) if iou > 0.5: matched = True break if not matched: rect = plt.Rectangle( (gt['x'], gt['y']), gt['w'], gt['h'], fill=False, color='blue', linewidth=2 ) ax3.add_patch(rect) plt.tight_layout() plt.show() # 热力图分析 def generate_detection_heatmap(model, image, stride=8): """ 生成检测响应热力图 """ h, w = image.shape[:2] heatmap = np.zeros((h // stride, w // stride)) # 滑窗检测 for y in range(0, h - stride, stride): for x in range(0, w - stride, stride): patch = image[y:y+stride*2, x:x+stride*2] with torch.no_grad(): response = model.get_objectness(patch) heatmap[y//stride, x//stride] = response.max() # 上采样到原图大小 heatmap = cv2.resize(heatmap, (w, h)) # 叠加显示 plt.figure(figsize=(12, 6)) plt.subplot(1, 2, 1) plt.imshow(image) plt.title('Original Image') plt.subplot(1, 2, 2) plt.imshow(image) plt.imshow(heatmap, alpha=0.5, cmap='jet') plt.colorbar() plt.title('Detection Heatmap') plt.show()

七、最佳实践总结

场景化配置推荐

场景推荐配置
医疗细菌检测YOLOv8 + P2层 + 1280输入 + Copy-Paste + Soft-NMS
无人机小目标YOLOv5 + 自定义anchor + TTA + 超分辨率
PCB缺陷检测YOLOv8 + Decoupled Head + 多尺度训练 + DIoU-NMS
卫星遥感YOLOv10 + 切片检测 + 全局-局部融合

调优优先级

  1. 数据层面(效果最明显)

    • 提高标注质量
    • 增加小目标样本
    • Copy-Paste 增强
  2. 模型结构(次优先)

    • 添加 P2 检测层
    • 使用 PANet/BiFPN
    • Decoupled Head
  3. 训练策略

    • 多尺度训练
    • Mosaic 增强
    • 适当的损失权重
  4. 后处理

    • Soft-NMS
    • TTA
    • 置信度阈值调整

常见问题与解决

  1. 漏检率高

    • 降低置信度阈值
    • 增加输入分辨率
    • 使用 TTA
  2. 误检率高

    • 提高 NMS 阈值
    • 加强负样本训练
    • 使用 Focal Loss
  3. 速度慢

    • 使用轻量级分支
    • 减少 TTA scales
    • 模型剪枝/量化
  4. 显存不足

    • 减小 batch size
    • 使用梯度累积
    • 混合精度训练

通过合理组合这些优化策略,YOLO 系列可以在小目标检测任务上达到优秀的性能。关键是根据具体场景选择合适的优化组合,并通过实验不断调整。

Last updated on