The Complete Guide to Small-Object Detection Optimization for the YOLO Family
Small-object detection remains one of the hardest problems in object detection. This article is a systematic survey of small-object optimization strategies across the YOLO family (v3–v10), covering model architecture, training techniques, and engineering practice to form a complete solution.
Why Is Small-Object Detection Hard?
Technical challenges
- Weak feature representation: after repeated downsampling, the features of a small object all but disappear
- Receptive-field mismatch: the large receptive fields of deep layers are poorly matched to small objects
- Positive/negative sample imbalance: small objects yield few positive samples, which makes training difficult
- Tight localization tolerance: an offset of a few pixels is a large relative error for a small object (the quick calculation below makes the first two points concrete)
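To see how quickly small objects vanish, consider the footprint of a 16×16 px object on each feature map of a typical YOLO backbone. This is a minimal sketch; the strides are the usual P2–P5 values rather than those of any specific implementation:

# footprint of a 16x16 px object on each pyramid level
obj = 16  # object size in pixels
for name, stride in [("P2", 4), ("P3", 8), ("P4", 16), ("P5", 32)]:
    cells = obj / stride  # how many feature-map cells the object spans
    print(f"{name} (stride {stride}): {cells:.2f} x {cells:.2f} cells")
# At P5 (stride 32) a 16 px object covers half a cell, so its features are
# effectively averaged away - which is why a P2 layer (stride 4) helps.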
Real-world scenarios
- Medical imaging: tiny lesions such as cells and bacteria
- Drone aerial imagery: pedestrians and vehicles seen from high altitude
- Satellite remote sensing: buildings, ships, and similar targets
- Industrial inspection: minute defects on PCBs
I. Model Architecture Improvements
1. Multi-Scale Feature Fusion
From FPN to PANet
# The FPN structure used in YOLOv3
import torch
import torch.nn as nn
import torch.nn.functional as F

class FPN(nn.Module):
    def __init__(self, in_channels_list, out_channels):
        super().__init__()
        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()
        for in_channels in in_channels_list:
            self.lateral_convs.append(
                nn.Conv2d(in_channels, out_channels, 1)
            )
            self.fpn_convs.append(
                nn.Conv2d(out_channels, out_channels, 3, padding=1)
            )

    def forward(self, features):
        # features: [P3, P4, P5] from the backbone
        laterals = [conv(f) for conv, f in zip(self.lateral_convs, features)]
        # top-down fusion
        for i in range(len(laterals) - 1, 0, -1):
            laterals[i - 1] += F.interpolate(
                laterals[i], scale_factor=2, mode='nearest'
            )
        # output features
        outs = [conv(lat) for conv, lat in zip(self.fpn_convs, laterals)]
        return outs

# The PANet structure used in YOLOv4/v5
class PANet(nn.Module):
    def __init__(self):
        super().__init__()
        # top-down path (FPN)
        self.fpn_p5_to_p4 = nn.Conv2d(512, 256, 1)
        self.fpn_p4_to_p3 = nn.Conv2d(256, 128, 1)
        # bottom-up path (PAN)
        self.pan_p3_to_p4 = nn.Conv2d(128, 256, 3, stride=2, padding=1)
        self.pan_p4_to_p5 = nn.Conv2d(256, 512, 3, stride=2, padding=1)

    def forward(self, features):
        p3, p4, p5 = features
        # FPN: top-down
        p5_up = F.interpolate(self.fpn_p5_to_p4(p5), scale_factor=2)
        p4 = p4 + p5_up
        p4_up = F.interpolate(self.fpn_p4_to_p3(p4), scale_factor=2)
        p3 = p3 + p4_up
        # PAN: bottom-up
        p3_down = self.pan_p3_to_p4(p3)
        p4 = p4 + p3_down
        p4_down = self.pan_p4_to_p5(p4)
        p5 = p5 + p4_down
        return [p3, p4, p5]  # multi-scale outputs

Adding a Shallower Detection Layer (P2)
# configuration that adds a P2 detection layer
# (DetectHead: a per-scale detection head defined elsewhere)
class YOLOv5_P2(nn.Module):
    def __init__(self):
        super().__init__()
        # the original P3, P4, P5 heads
        self.detect_p3 = DetectHead(128, num_classes=80)  # stride=8
        self.detect_p4 = DetectHead(256, num_classes=80)  # stride=16
        self.detect_p5 = DetectHead(512, num_classes=80)  # stride=32
        # the new P2 head
        self.detect_p2 = DetectHead(64, num_classes=80)   # stride=4
        # P2 feature extraction
        self.p2_conv = nn.Sequential(
            nn.Conv2d(64, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        # x holds the [P2, P3, P4, P5] features from the backbone
        p2, p3, p4, p5 = x
        # enhance the P2 features
        p2 = self.p2_conv(p2)
        # multi-scale detection
        outputs = []
        outputs.append(self.detect_p2(p2))  # 160x160, for very small objects
        outputs.append(self.detect_p3(p3))  # 80x80, for small objects
        outputs.append(self.detect_p4(p4))  # 40x40, for medium objects
        outputs.append(self.detect_p5(p5))  # 20x20, for large objects
        return outputs

2. Higher-Resolution Input
# dynamically adjusting the input resolution
import random

class MultiScaleTraining:
    def __init__(self, base_size=640, scales=[0.5, 0.75, 1.0, 1.25, 1.5]):
        self.base_size = base_size
        self.scales = scales

    def get_size(self, epoch):
        # raise the resolution progressively
        if epoch < 50:
            return 640
        elif epoch < 100:
            return 896
        elif epoch < 150:
            return 1024
        else:
            # random multi-scale
            scale = random.choice(self.scales)
            return int(self.base_size * scale)

# usage example
def train_with_dynamic_resolution(model, dataloader, optimizer, epochs):
    scale_scheduler = MultiScaleTraining()
    for epoch in range(epochs):
        input_size = scale_scheduler.get_size(epoch)
        # propagate the new size to the dataset
        dataloader.dataset.img_size = input_size
        for images, targets in dataloader:
            # resize the batch to the scheduled resolution
            images = F.interpolate(images, size=(input_size, input_size))
            # training step (compute_loss: the model's own loss function)
            outputs = model(images)
            loss = compute_loss(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

3. Anchor Optimization (YOLOv3–v7)
import numpy as np
from sklearn.cluster import KMeans

def optimize_anchors_for_small_objects(dataset, num_anchors=9, img_size=640):
    """
    Optimize anchors for a dataset dominated by small objects.
    """
    # collect all bounding boxes
    boxes = []
    for _, targets in dataset:
        for target in targets:
            w, h = target[3] * img_size, target[4] * img_size
            boxes.append([w, h])
    boxes = np.array(boxes)
    # keep only small objects (e.g., area < 32x32)
    small_boxes = boxes[boxes[:, 0] * boxes[:, 1] < 32 * 32]
    # K-means clustering
    kmeans = KMeans(n_clusters=num_anchors, random_state=0)
    kmeans.fit(small_boxes)
    # the cluster centers become the anchors
    anchors = kmeans.cluster_centers_
    # sort by area
    areas = anchors[:, 0] * anchors[:, 1]
    sorted_idx = np.argsort(areas)
    anchors = anchors[sorted_idx]
    print("Optimized small-object anchors:")
    for i, (w, h) in enumerate(anchors):
        print(f"  Anchor {i+1}: {w:.1f} x {h:.1f}")
    return anchors

# an improved variant that uses IoU as the distance metric
def kmeans_iou(boxes, num_anchors):
    """
    K-means clustering with IoU distance.
    """
    def iou_distance(box, anchor):
        # IoU, assuming both boxes are anchored at the origin
        intersection = min(box[0], anchor[0]) * min(box[1], anchor[1])
        union = box[0] * box[1] + anchor[0] * anchor[1] - intersection
        return 1 - intersection / union

    # initialize the anchors
    anchors = boxes[np.random.choice(len(boxes), num_anchors, replace=False)]
    for _ in range(100):  # number of iterations
        # assign each box to its nearest anchor
        distances = np.array([
            [iou_distance(box, anchor) for anchor in anchors]
            for box in boxes
        ])
        assignments = np.argmin(distances, axis=1)
        # update the anchors
        new_anchors = []
        for i in range(num_anchors):
            cluster_boxes = boxes[assignments == i]
            if len(cluster_boxes) > 0:
                # the median is more robust than the mean
                new_anchors.append(np.median(cluster_boxes, axis=0))
            else:
                new_anchors.append(anchors[i])
        anchors = np.array(new_anchors)
    return anchors

4. Improved Detection Heads
# Decoupled head - adopted by YOLOv6/v8
class DecoupledHead(nn.Module):
    def __init__(self, in_channels, num_classes):
        super().__init__()
        # classification branch
        self.cls_conv = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, 3, padding=1),
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels, in_channels, 3, padding=1),
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True)
        )
        self.cls_pred = nn.Conv2d(in_channels, num_classes, 1)
        # regression branch
        self.reg_conv = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, 3, padding=1),
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels, in_channels, 3, padding=1),
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True)
        )
        self.reg_pred = nn.Conv2d(in_channels, 4, 1)  # bounding-box regression
        # optional IoU branch
        self.iou_pred = nn.Conv2d(in_channels, 1, 1)  # IoU prediction

    def forward(self, x):
        # classification
        cls_feat = self.cls_conv(x)
        cls_output = self.cls_pred(cls_feat)
        # regression
        reg_feat = self.reg_conv(x)
        reg_output = self.reg_pred(reg_feat)
        iou_output = self.iou_pred(reg_feat)
        return cls_output, reg_output, iou_output

# RepPoints-style deformable detection
from torchvision.ops import DeformConv2d

class DeformableHead(nn.Module):
    def __init__(self, in_channels, num_classes):
        super().__init__()
        self.offset_conv = nn.Conv2d(in_channels, 18, 3, padding=1)  # offsets for 9 sampling points (2 x 3 x 3)
        self.deform_conv = DeformConv2d(in_channels, in_channels, 3, padding=1)
        self.cls_conv = nn.Conv2d(in_channels, num_classes, 1)
        self.reg_conv = nn.Conv2d(in_channels, 4, 1)

    def forward(self, x):
        # learn the sampling-point offsets
        offsets = self.offset_conv(x)
        # deformable convolution
        x = self.deform_conv(x, offsets)
        # predictions
        cls = self.cls_conv(x)
        reg = self.reg_conv(x)
        return cls, reg

II. Training Techniques
1. Data Augmentation Strategies
Mosaic augmentation
import cv2

class MosaicAugmentation:
    def __init__(self, img_size=640, prob=1.0):
        self.img_size = img_size
        self.prob = prob

    def __call__(self, images, targets):
        if random.random() > self.prob:
            return images[0], targets[0]
        # pick 4 images at random
        if len(images) < 4:
            images = images * 4
            targets = targets * 4
        indices = random.sample(range(len(images)), 4)
        imgs = [images[i] for i in indices]
        labels = [targets[i] for i in indices]
        # build a 2x2 mosaic; each tile is resized to half the canvas
        s = self.img_size
        tw, th = s // 2, s // 2
        mosaic_img = np.zeros((s, s, 3), dtype=np.float32)
        mosaic_labels = []
        # random mosaic center
        cx = random.randint(tw // 2, tw * 3 // 2)
        cy = random.randint(th // 2, th * 3 // 2)
        for i, (img, label) in enumerate(zip(imgs, labels)):
            img = cv2.resize(img, (tw, th))
            # destination rectangle on the canvas, clipped to its bounds
            if i == 0:    # top-left
                x1, y1, x2, y2 = max(cx - tw, 0), max(cy - th, 0), cx, cy
                sx1, sy1 = tw - (x2 - x1), th - (y2 - y1)
            elif i == 1:  # top-right
                x1, y1, x2, y2 = cx, max(cy - th, 0), min(cx + tw, s), cy
                sx1, sy1 = 0, th - (y2 - y1)
            elif i == 2:  # bottom-left
                x1, y1, x2, y2 = max(cx - tw, 0), cy, cx, min(cy + th, s)
                sx1, sy1 = tw - (x2 - x1), 0
            else:         # bottom-right
                x1, y1, x2, y2 = cx, cy, min(cx + tw, s), min(cy + th, s)
                sx1, sy1 = 0, 0
            sx2, sy2 = sx1 + (x2 - x1), sy1 + (y2 - y1)
            # paste the visible part of the tile
            mosaic_img[y1:y2, x1:x2] = img[sy1:sy2, sx1:sx2]
            # remap labels from tile coordinates to mosaic coordinates
            for cls, bx, by, bw, bh in label:
                px = bx * tw - sx1 + x1  # tile pixels -> mosaic pixels
                py = by * th - sy1 + y1
                pw, ph = bw * tw, bh * th
                # clip the box to the pasted region
                bx1 = np.clip(px - pw / 2, x1, x2)
                by1 = np.clip(py - ph / 2, y1, y2)
                bx2 = np.clip(px + pw / 2, x1, x2)
                by2 = np.clip(py + ph / 2, y1, y2)
                if bx2 - bx1 > 1 and by2 - by1 > 1:  # drop degenerate boxes
                    mosaic_labels.append([
                        cls,
                        (bx1 + bx2) / 2 / s, (by1 + by2) / 2 / s,
                        (bx2 - bx1) / s, (by2 - by1) / s,
                    ])
        return mosaic_img, np.array(mosaic_labels)

Copy-Paste augmentation
class CopyPasteAugmentation:
    def __init__(self, prob=0.5, max_objects=30):
        self.prob = prob
        self.max_objects = max_objects

    def __call__(self, image, labels, paste_objects):
        """
        paste_objects: a pre-cropped library of small objects
        """
        if random.random() > self.prob:
            return image, labels
        h, w = image.shape[:2]
        new_labels = labels.copy()
        # randomly choose how many small objects to paste
        n_paste = random.randint(1, min(self.max_objects, len(paste_objects)))
        selected = random.sample(paste_objects, n_paste)
        for obj in selected:
            obj_img, obj_mask, obj_label = obj
            oh, ow = obj_img.shape[:2]
            # random position (keep away from the border)
            max_x = w - ow
            max_y = h - oh
            if max_x <= 0 or max_y <= 0:
                continue
            x = random.randint(0, max_x)
            y = random.randint(0, max_y)
            # overlap check (optional)
            overlap = self.check_overlap(new_labels, x, y, ow, oh)
            if overlap > 0.3:  # overlap threshold
                continue
            # blend the object in through its mask
            image[y:y+oh, x:x+ow] = \
                image[y:y+oh, x:x+ow] * (1 - obj_mask) + obj_img * obj_mask
            # append the new label
            cx = (x + ow/2) / w
            cy = (y + oh/2) / h
            bw = ow / w
            bh = oh / h
            new_labels = np.vstack([new_labels, [obj_label, cx, cy, bw, bh]])
        return image, new_labels

    def check_overlap(self, labels, x, y, w, h):
        # compute the overlap with existing objects
        # implementation omitted...
        return 0.0

2. Oversampling Small Objects
from torch.utils.data import DataLoader, WeightedRandomSampler

def create_weighted_sampler(dataset, size_threshold=32*32):
    """
    Give images that contain small objects a higher sampling weight.
    """
    weights = []
    for idx in range(len(dataset)):
        _, targets = dataset[idx]
        # count the small objects
        small_obj_count = 0
        for target in targets:
            # assumes targets are [class, x, y, w, h], normalized
            area = target[3] * target[4] * dataset.img_size ** 2
            if area < size_threshold:
                small_obj_count += 1
        # weight grows with the number of small objects
        weight = 1.0 + small_obj_count * 0.5  # tunable coefficient
        weights.append(weight)
    # build the sampler
    sampler = WeightedRandomSampler(
        weights=weights,
        num_samples=len(dataset),
        replacement=True
    )
    return sampler

# usage example
train_sampler = create_weighted_sampler(train_dataset)
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    sampler=train_sampler,
    num_workers=4
)

3. Loss Function Optimization
# the IoU family of losses
import math

class IoULoss(nn.Module):
    def __init__(self, loss_type="ciou"):
        super().__init__()
        self.loss_type = loss_type

    def forward(self, pred_boxes, target_boxes):
        """
        pred_boxes, target_boxes: [N, 4] (x1, y1, x2, y2)
        """
        # intersection
        inter_x1 = torch.max(pred_boxes[:, 0], target_boxes[:, 0])
        inter_y1 = torch.max(pred_boxes[:, 1], target_boxes[:, 1])
        inter_x2 = torch.min(pred_boxes[:, 2], target_boxes[:, 2])
        inter_y2 = torch.min(pred_boxes[:, 3], target_boxes[:, 3])
        inter_area = torch.clamp(inter_x2 - inter_x1, min=0) * \
                     torch.clamp(inter_y2 - inter_y1, min=0)
        # union
        pred_area = (pred_boxes[:, 2] - pred_boxes[:, 0]) * \
                    (pred_boxes[:, 3] - pred_boxes[:, 1])
        target_area = (target_boxes[:, 2] - target_boxes[:, 0]) * \
                      (target_boxes[:, 3] - target_boxes[:, 1])
        union_area = pred_area + target_area - inter_area
        # plain IoU
        iou = inter_area / (union_area + 1e-7)

        if self.loss_type == "iou":
            loss = 1 - iou
        elif self.loss_type == "giou":
            # smallest enclosing box
            enclose_x1 = torch.min(pred_boxes[:, 0], target_boxes[:, 0])
            enclose_y1 = torch.min(pred_boxes[:, 1], target_boxes[:, 1])
            enclose_x2 = torch.max(pred_boxes[:, 2], target_boxes[:, 2])
            enclose_y2 = torch.max(pred_boxes[:, 3], target_boxes[:, 3])
            enclose_area = (enclose_x2 - enclose_x1) * (enclose_y2 - enclose_y1)
            giou = iou - (enclose_area - union_area) / enclose_area
            loss = 1 - giou
        else:  # "diou" or "ciou"
            # center-point distance
            pred_cx = (pred_boxes[:, 0] + pred_boxes[:, 2]) / 2
            pred_cy = (pred_boxes[:, 1] + pred_boxes[:, 3]) / 2
            target_cx = (target_boxes[:, 0] + target_boxes[:, 2]) / 2
            target_cy = (target_boxes[:, 1] + target_boxes[:, 3]) / 2
            center_dist = (pred_cx - target_cx) ** 2 + (pred_cy - target_cy) ** 2
            # squared diagonal of the enclosing box
            enclose_x1 = torch.min(pred_boxes[:, 0], target_boxes[:, 0])
            enclose_y1 = torch.min(pred_boxes[:, 1], target_boxes[:, 1])
            enclose_x2 = torch.max(pred_boxes[:, 2], target_boxes[:, 2])
            enclose_y2 = torch.max(pred_boxes[:, 3], target_boxes[:, 3])
            diagonal_dist = (enclose_x2 - enclose_x1) ** 2 + \
                            (enclose_y2 - enclose_y1) ** 2
            diou = iou - center_dist / (diagonal_dist + 1e-7)
            if self.loss_type == "diou":
                loss = 1 - diou
            else:
                # CIoU adds an aspect-ratio penalty on top of DIoU
                # (standard formulation from Zheng et al., 2020)
                pred_w = pred_boxes[:, 2] - pred_boxes[:, 0]
                pred_h = pred_boxes[:, 3] - pred_boxes[:, 1]
                target_w = target_boxes[:, 2] - target_boxes[:, 0]
                target_h = target_boxes[:, 3] - target_boxes[:, 1]
                v = (4 / math.pi ** 2) * (
                    torch.atan(target_w / (target_h + 1e-7)) -
                    torch.atan(pred_w / (pred_h + 1e-7))
                ) ** 2
                with torch.no_grad():
                    alpha = v / (1 - iou + v + 1e-7)
                loss = 1 - (diou - alpha * v)
        return loss.mean()

# Focal Loss for class imbalance
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, pred, target):
        """
        pred: [N, num_classes] class logits
        target: [N] ground-truth class indices
        """
        ce_loss = F.cross_entropy(pred, target, reduction='none')
        p_t = torch.exp(-ce_loss)
        # focal modulation: down-weight easy examples
        focal_weight = (1 - p_t) ** self.gamma
        # scalar alpha balancing; for per-class balancing, pass a
        # class-weight vector to F.cross_entropy instead
        loss = self.alpha * focal_weight * ce_loss
        return loss.mean()

III. Inference and Post-Processing
1. Test-Time Augmentation (TTA)
class TTA:
    def __init__(self, model, scales=[0.8, 1.0, 1.2], flips=[False, True]):
        self.model = model
        self.scales = scales
        self.flips = flips

    def __call__(self, image):
        predictions = []
        for scale in self.scales:
            for flip in self.flips:
                # rescale
                scaled_img = F.interpolate(
                    image,
                    scale_factor=scale,
                    mode='bilinear'
                )
                # horizontal flip
                if flip:
                    scaled_img = torch.flip(scaled_img, dims=[-1])
                # predict
                with torch.no_grad():
                    pred = self.model(scaled_img)
                # undo the augmentation (assumes pixel-coordinate cx, cy, w, h outputs)
                if flip:
                    pred[..., 0] = scaled_img.shape[-1] - pred[..., 0]  # un-flip x
                pred[..., :4] /= scale  # undo the rescaling
                predictions.append(pred)
        # merge all predictions
        return self.merge_predictions(predictions)

    def merge_predictions(self, predictions):
        """
        Merge the augmented predictions with NMS (or a voting scheme).
        decode_prediction and nms are the model's own helpers.
        """
        all_boxes = []
        all_scores = []
        all_classes = []
        for pred in predictions:
            boxes, scores, classes = self.decode_prediction(pred)
            all_boxes.append(boxes)
            all_scores.append(scores)
            all_classes.append(classes)
        # concatenate everything
        all_boxes = torch.cat(all_boxes, dim=0)
        all_scores = torch.cat(all_scores, dim=0)
        all_classes = torch.cat(all_classes, dim=0)
        # NMS
        keep = self.nms(all_boxes, all_scores, iou_threshold=0.5)
        return all_boxes[keep], all_scores[keep], all_classes[keep]

2. Improved NMS
def soft_nms(boxes, scores, sigma=0.5, score_threshold=0.001):
    """
    Soft-NMS: instead of discarding overlapping boxes outright,
    decay their scores by a Gaussian of the overlap.
    """
    scores = scores.clone()
    keep = []
    remaining = torch.arange(boxes.shape[0])
    while remaining.numel() > 0:
        # take the highest-scoring remaining box
        top = scores[remaining].argmax()
        i = remaining[top]
        keep.append(i.item())
        remaining = torch.cat([remaining[:top], remaining[top + 1:]])
        if remaining.numel() == 0:
            break
        # IoU between box i and the remaining boxes
        xx1 = torch.max(boxes[i, 0], boxes[remaining, 0])
        yy1 = torch.max(boxes[i, 1], boxes[remaining, 1])
        xx2 = torch.min(boxes[i, 2], boxes[remaining, 2])
        yy2 = torch.min(boxes[i, 3], boxes[remaining, 3])
        w = torch.clamp(xx2 - xx1, min=0)
        h = torch.clamp(yy2 - yy1, min=0)
        inter = w * h
        area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
        areas = (boxes[remaining, 2] - boxes[remaining, 0]) * \
                (boxes[remaining, 3] - boxes[remaining, 1])
        iou = inter / (area_i + areas - inter)
        # Soft-NMS: decay scores according to the IoU
        scores[remaining] *= torch.exp(-(iou ** 2) / sigma)
        # keep only boxes whose score stays above the threshold
        remaining = remaining[scores[remaining] > score_threshold]
    return keep

from torchvision.ops import box_iou

def diou_nms(boxes, scores, iou_threshold=0.5):
    """
    DIoU-NMS: NMS that also accounts for center-point distance,
    which helps keep nearby small objects apart.
    """
    keep = []
    order = scores.argsort(descending=True)
    while order.numel() > 0:
        i = order[0]
        keep.append(i)
        if order.numel() == 1:
            break
        # IoU
        iou = box_iou(boxes[i].unsqueeze(0), boxes[order[1:]])[0]
        # center-point distance
        center_i = (boxes[i, :2] + boxes[i, 2:]) / 2
        centers = (boxes[order[1:], :2] + boxes[order[1:], 2:]) / 2
        center_distance = ((center_i - centers) ** 2).sum(dim=1).sqrt()
        # diagonal of the enclosing box
        enclose_mins = torch.min(boxes[i, :2], boxes[order[1:], :2])
        enclose_maxs = torch.max(boxes[i, 2:], boxes[order[1:], 2:])
        diagonal = ((enclose_maxs - enclose_mins) ** 2).sum(dim=1).sqrt()
        # DIoU
        diou = iou - (center_distance / diagonal) ** 2
        # keep boxes whose DIoU stays below the threshold
        idx = torch.where(diou < iou_threshold)[0]
        order = order[idx + 1]
    return keep

IV. Practical Engineering Techniques
1. Annotation Quality Control
class LabelQualityChecker:
    def __init__(self, min_size=2, max_aspect_ratio=10):
        self.min_size = min_size
        self.max_aspect_ratio = max_aspect_ratio

    def check_dataset(self, dataset):
        """
        Audit the annotation quality of a dataset.
        """
        issues = []
        for idx, (image, labels) in enumerate(dataset):
            h, w = image.shape[:2]
            for label_idx, label in enumerate(labels):
                cls, x, y, bw, bh = label
                # check the box size in pixels
                pixel_w = bw * w
                pixel_h = bh * h
                if pixel_w < self.min_size or pixel_h < self.min_size:
                    issues.append({
                        'type': 'too_small',
                        'image_idx': idx,
                        'label_idx': label_idx,
                        'size': (pixel_w, pixel_h)
                    })
                    continue  # skip further checks on degenerate boxes
                # check the aspect ratio
                aspect_ratio = max(pixel_w / pixel_h, pixel_h / pixel_w)
                if aspect_ratio > self.max_aspect_ratio:
                    issues.append({
                        'type': 'extreme_aspect_ratio',
                        'image_idx': idx,
                        'label_idx': label_idx,
                        'aspect_ratio': aspect_ratio
                    })
                # check that the box stays inside the image
                if x - bw/2 < 0 or x + bw/2 > 1 or \
                   y - bh/2 < 0 or y + bh/2 > 1:
                    issues.append({
                        'type': 'out_of_bounds',
                        'image_idx': idx,
                        'label_idx': label_idx
                    })
        return issues

# semi-automatic labeling assistance
class SemiAutoLabeling:
    def __init__(self, model, confidence_threshold=0.7):
        self.model = model
        self.confidence_threshold = confidence_threshold

    def generate_pseudo_labels(self, unlabeled_images):
        """
        Generate pseudo-labels with a trained model.
        decode_predictions is the model's own decoding helper.
        """
        pseudo_labels = []
        for image in unlabeled_images:
            with torch.no_grad():
                predictions = self.model(image)
            # keep only high-confidence predictions
            boxes, scores, classes = self.decode_predictions(predictions)
            high_conf_mask = scores > self.confidence_threshold
            boxes = boxes[high_conf_mask]
            scores = scores[high_conf_mask]
            classes = classes[high_conf_mask]
            # flag items for manual review
            labels = []
            for box, score, cls in zip(boxes, scores, classes):
                labels.append({
                    'box': box,
                    'class': cls,
                    'score': score,
                    'need_review': score < 0.9  # mark borderline labels for review
                })
            pseudo_labels.append(labels)
        return pseudo_labels

2. Hybrid Model Strategies
class SuperResolutionDetector:
    def __init__(self, sr_model, detection_model, sr_scale=2):
        self.sr_model = sr_model  # super-resolution model
        self.detection_model = detection_model
        self.sr_scale = sr_scale

    def detect(self, image, use_sr_threshold=32):
        """
        Apply super-resolution to regions that contain small objects.
        decode_detections and merge_detections are this class's own helpers.
        """
        # first pass: coarse detections
        with torch.no_grad():
            initial_detections = self.detection_model(image)
        boxes, scores, classes = self.decode_detections(initial_detections)
        # find the small-object regions
        small_regions = []
        h, w = image.shape[-2:]
        for box in boxes:
            box_w = (box[2] - box[0]) * w
            box_h = (box[3] - box[1]) * h
            if box_w < use_sr_threshold or box_h < use_sr_threshold:
                # expand the region with a little padding
                x1 = max(0, int(box[0] * w - 10))
                y1 = max(0, int(box[1] * h - 10))
                x2 = min(w, int(box[2] * w + 10))
                y2 = min(h, int(box[3] * h + 10))
                small_regions.append((x1, y1, x2, y2))
        # run super-resolution on each small-object region
        enhanced_detections = []
        for x1, y1, x2, y2 in small_regions:
            # crop the region
            region = image[:, :, y1:y2, x1:x2]
            # super-resolve
            with torch.no_grad():
                sr_region = self.sr_model(region)
            # re-detect on the enhanced region
            with torch.no_grad():
                region_detections = self.detection_model(sr_region)
            # map the detections back to original-image coordinates
            region_boxes, region_scores, region_classes = \
                self.decode_detections(region_detections)
            # with normalized box coordinates the SR factor cancels out:
            # a box at fraction f of the SR crop sits at fraction f of the crop
            for box in region_boxes:
                box[0] = (box[0] * (x2 - x1) + x1) / w
                box[1] = (box[1] * (y2 - y1) + y1) / h
                box[2] = (box[2] * (x2 - x1) + x1) / w
                box[3] = (box[3] * (y2 - y1) + y1) / h
            enhanced_detections.append((region_boxes, region_scores, region_classes))
        # merge everything
        return self.merge_detections(initial_detections, enhanced_detections)

3. Lightweight Branch Design
class LightweightSmallObjectBranch(nn.Module):
    def __init__(self, in_channels=64):
        super().__init__()
        # lightweight (depthwise-separable) convolution
        self.dw_conv = nn.Conv2d(
            in_channels, in_channels, 3,
            padding=1, groups=in_channels
        )
        self.pw_conv = nn.Conv2d(in_channels, in_channels // 2, 1)
        # channel-attention module (SE-style)
        self.attention = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(in_channels // 2, in_channels // 8, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels // 8, in_channels // 2, 1),
            nn.Sigmoid()
        )
        # detection head
        self.detect = nn.Conv2d(in_channels // 2, 5, 1)  # 4 bbox + 1 objectness

    def forward(self, x):
        # depthwise-separable convolution
        x = self.dw_conv(x)
        x = self.pw_conv(x)
        # channel attention
        att = self.attention(x)
        x = x * att
        # detection output
        return self.detect(x)

# integrating the branch into the main model
class YOLOWithSmallObjectBranch(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        # add the small-object branch on the shallowest features
        self.small_obj_branch = LightweightSmallObjectBranch(64)

    def forward(self, x):
        # extract multi-scale features
        features = self.base_model.extract_features(x)
        # shallow features feed the small-object branch
        small_obj_pred = self.small_obj_branch(features[0])  # shallowest level
        # regular detection
        regular_pred = self.base_model.detect(features)
        return {
            'small': small_obj_pred,
            'regular': regular_pred
        }

V. Version-Specific YOLO Optimizations
Configuration specifics for YOLOv3–v5
# yolov5s-small-objects.yaml
nc: 80  # number of classes
depth_multiple: 0.33
width_multiple: 0.50

# anchors - optimized for small objects
anchors:
  - [4,5, 6,8, 9,12]            # P2/4 - new, very small objects
  - [10,13, 16,30, 33,23]       # P3/8 - small objects
  - [30,61, 62,45, 59,119]      # P4/16 - medium objects
  - [116,90, 156,198, 373,326]  # P5/32 - large objects

# backbone
backbone:
  # [from, number, module, args]
  [[-1, 1, Conv, [64, 6, 2, 2]],   # 0-P1/2
   [-1, 1, Conv, [128, 3, 2]],     # 1-P2/4 - kept for detection
   [-1, 3, C3, [128]],
   [-1, 1, Conv, [256, 3, 2]],     # 3-P3/8
   [-1, 6, C3, [256]],
   [-1, 1, Conv, [512, 3, 2]],     # 5-P4/16
   [-1, 9, C3, [512]],
   [-1, 1, Conv, [1024, 3, 2]],    # 7-P5/32
   [-1, 3, C3, [1024]],
   [-1, 1, SPPF, [1024, 5]]]       # 9

# head - includes the P2 detection layer
head:
  [[-1, 1, Conv, [512, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],      # cat backbone P4
   [-1, 3, C3, [512, False]],      # 13
   [-1, 1, Conv, [256, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 4], 1, Concat, [1]],      # cat backbone P3
   [-1, 3, C3, [256, False]],      # 17 (P3/8-small)
   [-1, 1, Conv, [128, 1, 1]],     # new: upsample toward P2
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 2], 1, Concat, [1]],      # cat backbone P2
   [-1, 3, C3, [128, False]],      # 21 (P2/4-xsmall)
   [-1, 1, Conv, [128, 3, 2]],
   [[-1, 18], 1, Concat, [1]],     # cat head P3
   [-1, 3, C3, [256, False]],      # 24 (P3/8-small)
   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 14], 1, Concat, [1]],     # cat head P4
   [-1, 3, C3, [512, False]],      # 27 (P4/16-medium)
   [-1, 1, Conv, [512, 3, 2]],
   [[-1, 10], 1, Concat, [1]],     # cat head P5
   [-1, 3, C3, [1024, False]],     # 30 (P5/32-large)
   [[21, 24, 27, 30], 1, Detect, [nc, anchors]]]  # Detect includes P2
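Launching training with this config from inside the ultralytics/yolov5 repository should look roughly like the sketch below; train.run and its keyword names follow the repo's train.py interface, and the file names are placeholders:

# a minimal sketch of training YOLOv5 with the P2 config above
import train  # yolov5's train.py

train.run(
    cfg='models/yolov5s-small-objects.yaml',  # the P2 model definition above
    data='data/my_small_objects.yaml',        # hypothetical dataset config
    imgsz=1280,     # higher input resolution for small objects
    batch_size=8,   # smaller batch to compensate for the larger input
    epochs=300,
)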
YOLOv8-Specific Optimizations
# YOLOv8 small-object optimization settings
def create_yolov8_small_object_config():
    return {
        # model configuration
        'model': {
            'nc': 80,
            'scales': {
                'n': [0.33, 0.25],  # depth, width
                's': [0.33, 0.50],
                'm': [0.67, 0.75],
                'l': [1.00, 1.00],
                'x': [1.00, 1.25],
            }
        },
        # training configuration
        'train': {
            'imgsz': 1280,  # higher resolution
            'batch': 16,
            'epochs': 300,
            'patience': 50,
            'optimizer': 'AdamW',
            'lr0': 0.001,
            'lrf': 0.01,
            'momentum': 0.937,
            'weight_decay': 0.0005,
            'warmup_epochs': 3.0,
            'warmup_momentum': 0.8,
            'warmup_bias_lr': 0.1,
            # loss weights - raise the box weight to stress localization
            'box': 7.5,  # default 7.5
            'cls': 0.5,  # default 0.5
            'dfl': 1.5,  # default 1.5
            # data augmentation
            'hsv_h': 0.015,
            'hsv_s': 0.7,
            'hsv_v': 0.4,
            'degrees': 0.0,  # avoid heavy rotation for small objects
            'translate': 0.1,
            'scale': 0.5,
            'shear': 0.0,
            'perspective': 0.0,
            'flipud': 0.0,
            'fliplr': 0.5,
            'mosaic': 1.0,
            'mixup': 0.1,
            'copy_paste': 0.3,  # Copy-Paste augmentation
        },
        # validation configuration
        'val': {
            'imgsz': 1280,
            'conf': 0.001,  # lower the confidence threshold
            'iou': 0.6,
            'max_det': 300,
        }
    }
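These settings plug straight into the ultralytics API; the keyword names match ultralytics' documented train()/val() arguments, and the dataset path below is a placeholder:

# a minimal sketch of applying the settings with ultralytics
from ultralytics import YOLO

cfg = create_yolov8_small_object_config()
model = YOLO('yolov8s.yaml')  # or a pretrained 'yolov8s.pt'
model.train(
    data='my_small_objects.yaml',  # hypothetical dataset config
    **cfg['train'],
)
metrics = model.val(**cfg['val'])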
VI. Performance Evaluation and Debugging
1. Small-Object-Specific Evaluation Metrics
def evaluate_small_objects(predictions, ground_truths, size_ranges):
    """
    Evaluate detection performance per size bucket.
    size_ranges: [(0, 32), (32, 96), (96, float('inf'))]
    """
    results = {}
    for size_min, size_max in size_ranges:
        # keep only ground-truth boxes in this size range
        size_gts = []
        for gt in ground_truths:
            area = gt['width'] * gt['height']
            if size_min <= area < size_max:
                size_gts.append(gt)
        # AP for this size range (calculate_ap defined elsewhere)
        if len(size_gts) > 0:
            ap = calculate_ap(predictions, size_gts)
            results[f'{size_min}-{size_max}'] = ap
        else:
            results[f'{size_min}-{size_max}'] = 0.0
    return results

# COCO-style evaluation
class COCOEvaluator:
    def __init__(self):
        self.area_ranges = [
            (0, 32**2),       # small
            (32**2, 96**2),   # medium
            (96**2, 1e5**2)   # large
        ]

    def evaluate(self, dataset, model):
        # gather all predictions and ground truths
        all_predictions = []
        all_ground_truths = []
        for image, gt in dataset:
            pred = model(image)
            all_predictions.extend(pred)
            all_ground_truths.extend(gt)
        # overall metrics
        metrics = {
            'mAP': self.calculate_map(all_predictions, all_ground_truths),
            'mAP_50': self.calculate_map(all_predictions, all_ground_truths, iou_thresh=0.5),
            'mAP_75': self.calculate_map(all_predictions, all_ground_truths, iou_thresh=0.75),
        }
        # per-size metrics
        for i, (min_area, max_area) in enumerate(self.area_ranges):
            area_name = ['small', 'medium', 'large'][i]
            metrics[f'mAP_{area_name}'] = self.calculate_map_by_area(
                all_predictions, all_ground_truths, min_area, max_area
            )
        return metrics
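For standardized numbers, the same size buckets come for free from pycocotools once predictions are exported to COCO JSON; the annotation and prediction file paths below are placeholders:

# a minimal COCO-style evaluation sketch using pycocotools
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

coco_gt = COCO('annotations/instances_val.json')  # placeholder path
coco_dt = coco_gt.loadRes('predictions.json')     # placeholder path
evaluator = COCOeval(coco_gt, coco_dt, iouType='bbox')
evaluator.evaluate()
evaluator.accumulate()
evaluator.summarize()  # prints AP/AR, including AP_small (area < 32^2)
ap_small = evaluator.stats[3]  # AP for small objects in the stats vector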
2. Visualization and Debugging Tools
import cv2
import matplotlib.pyplot as plt
def visualize_small_object_detections(image, predictions, ground_truths,
                                      size_threshold=32):
    """
    Visualize small-object detection results.
    calculate_iou is a standard box-IoU helper defined elsewhere.
    """
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    # original image + all detections
    ax1 = axes[0]
    ax1.imshow(image)
    ax1.set_title('All Detections')
    for pred in predictions:
        rect = plt.Rectangle(
            (pred['x'], pred['y']), pred['w'], pred['h'],
            fill=False, color='red', linewidth=1
        )
        ax1.add_patch(rect)
    # small-object detections only
    ax2 = axes[1]
    ax2.imshow(image)
    ax2.set_title(f'Small Objects (<{size_threshold}px)')
    for pred in predictions:
        if pred['w'] < size_threshold or pred['h'] < size_threshold:
            rect = plt.Rectangle(
                (pred['x'], pred['y']), pred['w'], pred['h'],
                fill=False, color='yellow', linewidth=2
            )
            ax2.add_patch(rect)
    # missed-detection analysis
    ax3 = axes[2]
    ax3.imshow(image)
    ax3.set_title('Missed Detections')
    for gt in ground_truths:
        matched = False
        for pred in predictions:
            iou = calculate_iou(gt, pred)
            if iou > 0.5:
                matched = True
                break
        if not matched:
            rect = plt.Rectangle(
                (gt['x'], gt['y']), gt['w'], gt['h'],
                fill=False, color='blue', linewidth=2
            )
            ax3.add_patch(rect)
    plt.tight_layout()
    plt.show()

# heatmap analysis
def generate_detection_heatmap(model, image, stride=8):
    """
    Build a detection-response heatmap with a sliding window.
    Assumes the model exposes a get_objectness(patch) helper.
    """
    h, w = image.shape[:2]
    heatmap = np.zeros((h // stride, w // stride))
    # sliding-window scoring
    for y in range(0, h - stride, stride):
        for x in range(0, w - stride, stride):
            patch = image[y:y+stride*2, x:x+stride*2]
            with torch.no_grad():
                response = model.get_objectness(patch)
            heatmap[y//stride, x//stride] = response.max()
    # upsample back to image size
    heatmap = cv2.resize(heatmap, (w, h))
    # overlay
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.imshow(image)
    plt.title('Original Image')
    plt.subplot(1, 2, 2)
    plt.imshow(image)
    plt.imshow(heatmap, alpha=0.5, cmap='jet')
    plt.colorbar()
    plt.title('Detection Heatmap')
    plt.show()

VII. Best-Practice Summary
Recommended configurations by scenario
| Scenario | Recommended configuration |
|---|---|
| Bacteria detection in medical imaging | YOLOv8 + P2 layer + 1280 input + Copy-Paste + Soft-NMS |
| Small objects in drone imagery | YOLOv5 + custom anchors + TTA + super-resolution |
| PCB defect inspection | YOLOv8 + decoupled head + multi-scale training + DIoU-NMS |
| Satellite remote sensing | YOLOv10 + sliced (tiled) inference + global-local fusion (see the tiling sketch below) |
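The table's "sliced (tiled) inference" entry is the one technique this guide has not yet shown in code. Here is a minimal sketch, assuming a detect(img) callable that returns normalized [cls, cx, cy, w, h] boxes per tile; production pipelines usually use a dedicated library such as SAHI:

def sliced_inference(image, detect, tile=640, overlap=0.2):
    """Run a detector over overlapping tiles and map boxes back to the full image."""
    h, w = image.shape[:2]
    step = int(tile * (1 - overlap))
    all_boxes = []
    for y0 in range(0, max(h - tile, 0) + 1, step):
        for x0 in range(0, max(w - tile, 0) + 1, step):
            crop = image[y0:y0 + tile, x0:x0 + tile]
            for cls, cx, cy, bw, bh in detect(crop):  # normalized tile coords
                all_boxes.append([
                    cls,
                    (cx * tile + x0) / w,  # back to full-image coordinates
                    (cy * tile + y0) / h,
                    bw * tile / w,
                    bh * tile / h,
                ])
    # overlapping tiles produce duplicates: merge with NMS afterwards
    # (edge tiles that don't align with the step are omitted for brevity)
    return all_boxes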
Tuning priorities
1. Data (largest payoff)
   - Improve annotation quality
   - Add more small-object samples
   - Copy-Paste augmentation
2. Model architecture (next)
   - Add a P2 detection layer
   - Use PANet/BiFPN
   - Decoupled head
3. Training strategy
   - Multi-scale training
   - Mosaic augmentation
   - Well-chosen loss weights
4. Post-processing
   - Soft-NMS
   - TTA
   - Confidence-threshold tuning
Common problems and fixes
1. High miss rate
   - Lower the confidence threshold
   - Increase the input resolution
   - Use TTA
2. High false-positive rate
   - Raise the NMS threshold
   - Train harder on negative samples
   - Use Focal Loss
3. Slow inference
   - Use a lightweight branch
   - Reduce the number of TTA scales
   - Prune or quantize the model
4. Out of GPU memory
   - Reduce the batch size
   - Use gradient accumulation
   - Mixed-precision training (both sketched below)
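Gradient accumulation and mixed precision combine naturally when memory is tight. A minimal PyTorch sketch; model, dataloader, and compute_loss stand in for your own training pieces:

import torch

def low_memory_epoch(model, dataloader, optimizer, compute_loss, accum_steps=4):
    """One epoch with gradient accumulation + automatic mixed precision."""
    scaler = torch.cuda.amp.GradScaler()
    optimizer.zero_grad()
    for step, (images, targets) in enumerate(dataloader):
        with torch.cuda.amp.autocast():  # fp16 forward pass
            loss = compute_loss(model(images), targets) / accum_steps
        scaler.scale(loss).backward()    # accumulate scaled gradients
        if (step + 1) % accum_steps == 0:  # effective batch = batch x accum_steps
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()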
With a sensible combination of these strategies, the YOLO family can deliver strong small-object detection performance. The key is to pick the right mix of optimizations for your specific scenario and keep refining it through experiments.