The Complete Guide to Small-Object Detection Optimization for the YOLO Family
Small-object detection remains one of the hardest problems in object detection. This article is a systematic survey of small-object optimization strategies across the YOLO family (v3–v10), covering model architecture, training techniques, and engineering practice to form a complete solution.
Why Is Small-Object Detection Hard?
Technical challenges
- Weak feature representation: after repeated downsampling, the features of a small object all but disappear
- Receptive-field mismatch: the large receptive fields of deep layers are poorly matched to small objects
- Positive/negative sample imbalance: small objects yield few positive samples, which makes training difficult
- Tight localization tolerance: an offset of a few pixels is a large relative error for a small object (the quick calculation below makes the first two points concrete)
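To see how quickly small objects vanish, consider the footprint of a 16×16 px object on each feature map of a typical YOLO backbone. This is a minimal sketch; the strides are the usual P2–P5 values rather than those of any specific implementation:

# footprint of a 16x16 px object on each pyramid level
obj = 16  # object size in pixels
for name, stride in [("P2", 4), ("P3", 8), ("P4", 16), ("P5", 32)]:
    cells = obj / stride  # how many feature-map cells the object spans
    print(f"{name} (stride {stride}): {cells:.2f} x {cells:.2f} cells")
# At P5 (stride 32) a 16 px object covers half a cell, so its features are
# effectively averaged away - which is why a P2 layer (stride 4) helps.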
Real-world scenarios
- Medical imaging: tiny lesions such as cells and bacteria
- Drone aerial imagery: pedestrians and vehicles seen from high altitude
- Satellite remote sensing: buildings, ships, and similar targets
- Industrial inspection: minute defects on PCBs
I. Model Architecture Improvements
1. Multi-Scale Feature Fusion
From FPN to PANet
# The FPN structure used in YOLOv3
import torch
import torch.nn as nn
import torch.nn.functional as F

class FPN(nn.Module):
    def __init__(self, in_channels_list, out_channels):
        super().__init__()
        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()
        for in_channels in in_channels_list:
            self.lateral_convs.append(
                nn.Conv2d(in_channels, out_channels, 1)
            )
            self.fpn_convs.append(
                nn.Conv2d(out_channels, out_channels, 3, padding=1)
            )

    def forward(self, features):
        # features: [P3, P4, P5] from the backbone
        laterals = [conv(f) for conv, f in zip(self.lateral_convs, features)]
        # top-down fusion
        for i in range(len(laterals) - 1, 0, -1):
            laterals[i - 1] += F.interpolate(
                laterals[i], scale_factor=2, mode='nearest'
            )
        # output features
        outs = [conv(lat) for conv, lat in zip(self.fpn_convs, laterals)]
        return outs

# The PANet structure used in YOLOv4/v5
class PANet(nn.Module):
    def __init__(self):
        super().__init__()
        # top-down path (FPN)
        self.fpn_p5_to_p4 = nn.Conv2d(512, 256, 1)
        self.fpn_p4_to_p3 = nn.Conv2d(256, 128, 1)
        # bottom-up path (PAN)
        self.pan_p3_to_p4 = nn.Conv2d(128, 256, 3, stride=2, padding=1)
        self.pan_p4_to_p5 = nn.Conv2d(256, 512, 3, stride=2, padding=1)

    def forward(self, features):
        p3, p4, p5 = features
        # FPN: top-down
        p5_up = F.interpolate(self.fpn_p5_to_p4(p5), scale_factor=2)
        p4 = p4 + p5_up
        p4_up = F.interpolate(self.fpn_p4_to_p3(p4), scale_factor=2)
        p3 = p3 + p4_up
        # PAN: bottom-up
        p3_down = self.pan_p3_to_p4(p3)
        p4 = p4 + p3_down
        p4_down = self.pan_p4_to_p5(p4)
        p5 = p5 + p4_down
        return [p3, p4, p5]  # multi-scale outputs

Adding a Shallower Detection Layer (P2)
# configuration that adds a P2 detection layer
# (DetectHead: a per-scale detection head defined elsewhere)
class YOLOv5_P2(nn.Module):
    def __init__(self):
        super().__init__()
        # the original P3, P4, P5 heads
        self.detect_p3 = DetectHead(128, num_classes=80)  # stride=8
        self.detect_p4 = DetectHead(256, num_classes=80)  # stride=16
        self.detect_p5 = DetectHead(512, num_classes=80)  # stride=32
        # the new P2 head
        self.detect_p2 = DetectHead(64, num_classes=80)   # stride=4
        # P2 feature extraction
        self.p2_conv = nn.Sequential(
            nn.Conv2d(64, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        # x holds the [P2, P3, P4, P5] features from the backbone
        p2, p3, p4, p5 = x
        # enhance the P2 features
        p2 = self.p2_conv(p2)
        # multi-scale detection
        outputs = []
        outputs.append(self.detect_p2(p2))  # 160x160, for very small objects
        outputs.append(self.detect_p3(p3))  # 80x80, for small objects
        outputs.append(self.detect_p4(p4))  # 40x40, for medium objects
        outputs.append(self.detect_p5(p5))  # 20x20, for large objects
        return outputs

2. Higher-Resolution Input
# dynamically adjusting the input resolution
import random

class MultiScaleTraining:
    def __init__(self, base_size=640, scales=[0.5, 0.75, 1.0, 1.25, 1.5]):
        self.base_size = base_size
        self.scales = scales

    def get_size(self, epoch):
        # raise the resolution progressively
        if epoch < 50:
            return 640
        elif epoch < 100:
            return 896
        elif epoch < 150:
            return 1024
        else:
            # random multi-scale
            scale = random.choice(self.scales)
            return int(self.base_size * scale)

# usage example
def train_with_dynamic_resolution(model, dataloader, optimizer, epochs):
    scale_scheduler = MultiScaleTraining()
    for epoch in range(epochs):
        input_size = scale_scheduler.get_size(epoch)
        # propagate the new size to the dataset
        dataloader.dataset.img_size = input_size
        for images, targets in dataloader:
            # resize the batch to the scheduled resolution
            images = F.interpolate(images, size=(input_size, input_size))
            # training step (compute_loss: the model's own loss function)
            outputs = model(images)
            loss = compute_loss(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

3. Anchor Optimization (YOLOv3–v7)
import numpy as np
from sklearn.cluster import KMeans

def optimize_anchors_for_small_objects(dataset, num_anchors=9, img_size=640):
    """
    Optimize anchors for a dataset dominated by small objects.
    """
    # collect all bounding boxes
    boxes = []
    for _, targets in dataset:
        for target in targets:
            w, h = target[3] * img_size, target[4] * img_size
            boxes.append([w, h])
    boxes = np.array(boxes)
    # keep only small objects (e.g., area < 32x32)
    small_boxes = boxes[boxes[:, 0] * boxes[:, 1] < 32 * 32]
    # K-means clustering
    kmeans = KMeans(n_clusters=num_anchors, random_state=0)
    kmeans.fit(small_boxes)
    # the cluster centers become the anchors
    anchors = kmeans.cluster_centers_
    # sort by area
    areas = anchors[:, 0] * anchors[:, 1]
    sorted_idx = np.argsort(areas)
    anchors = anchors[sorted_idx]
    print("Optimized small-object anchors:")
    for i, (w, h) in enumerate(anchors):
        print(f"  Anchor {i+1}: {w:.1f} x {h:.1f}")
    return anchors

# an improved variant that uses IoU as the distance metric
def kmeans_iou(boxes, num_anchors):
    """
    K-means clustering with IoU distance.
    """
    def iou_distance(box, anchor):
        # IoU, assuming both boxes are anchored at the origin
        intersection = min(box[0], anchor[0]) * min(box[1], anchor[1])
        union = box[0] * box[1] + anchor[0] * anchor[1] - intersection
        return 1 - intersection / union

    # initialize the anchors
    anchors = boxes[np.random.choice(len(boxes), num_anchors, replace=False)]
    for _ in range(100):  # number of iterations
        # assign each box to its nearest anchor
        distances = np.array([
            [iou_distance(box, anchor) for anchor in anchors]
            for box in boxes
        ])
        assignments = np.argmin(distances, axis=1)
        # update the anchors
        new_anchors = []
        for i in range(num_anchors):
            cluster_boxes = boxes[assignments == i]
            if len(cluster_boxes) > 0:
                # the median is more robust than the mean
                new_anchors.append(np.median(cluster_boxes, axis=0))
            else:
                new_anchors.append(anchors[i])
        anchors = np.array(new_anchors)
    return anchors

4. Improved Detection Heads
# Decoupled head - adopted by YOLOv6/v8
class DecoupledHead(nn.Module):
    def __init__(self, in_channels, num_classes):
        super().__init__()
        # classification branch
        self.cls_conv = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, 3, padding=1),
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels, in_channels, 3, padding=1),
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True)
        )
        self.cls_pred = nn.Conv2d(in_channels, num_classes, 1)
        # regression branch
        self.reg_conv = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, 3, padding=1),
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels, in_channels, 3, padding=1),
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True)
        )
        self.reg_pred = nn.Conv2d(in_channels, 4, 1)  # bounding-box regression
        # optional IoU branch
        self.iou_pred = nn.Conv2d(in_channels, 1, 1)  # IoU prediction

    def forward(self, x):
        # classification
        cls_feat = self.cls_conv(x)
        cls_output = self.cls_pred(cls_feat)
        # regression
        reg_feat = self.reg_conv(x)
        reg_output = self.reg_pred(reg_feat)
        iou_output = self.iou_pred(reg_feat)
        return cls_output, reg_output, iou_output

# RepPoints-style deformable detection
from torchvision.ops import DeformConv2d

class DeformableHead(nn.Module):
    def __init__(self, in_channels, num_classes):
        super().__init__()
        self.offset_conv = nn.Conv2d(in_channels, 18, 3, padding=1)  # offsets for 9 sampling points (2 x 3 x 3)
        self.deform_conv = DeformConv2d(in_channels, in_channels, 3, padding=1)
        self.cls_conv = nn.Conv2d(in_channels, num_classes, 1)
        self.reg_conv = nn.Conv2d(in_channels, 4, 1)

    def forward(self, x):
        # learn the sampling-point offsets
        offsets = self.offset_conv(x)
        # deformable convolution
        x = self.deform_conv(x, offsets)
        # predictions
        cls = self.cls_conv(x)
        reg = self.reg_conv(x)
        return cls, reg

II. Training Techniques
1. Data Augmentation Strategies
Mosaic augmentation
import cv2

class MosaicAugmentation:
    def __init__(self, img_size=640, prob=1.0):
        self.img_size = img_size
        self.prob = prob

    def __call__(self, images, targets):
        if random.random() > self.prob:
            return images[0], targets[0]
        # pick 4 images at random
        if len(images) < 4:
            images = images * 4
            targets = targets * 4
        indices = random.sample(range(len(images)), 4)
        imgs = [images[i] for i in indices]
        labels = [targets[i] for i in indices]
        # build a 2x2 mosaic; each tile is resized to half the canvas
        s = self.img_size
        tw, th = s // 2, s // 2
        mosaic_img = np.zeros((s, s, 3), dtype=np.float32)
        mosaic_labels = []
        # random mosaic center
        cx = random.randint(tw // 2, tw * 3 // 2)
        cy = random.randint(th // 2, th * 3 // 2)
        for i, (img, label) in enumerate(zip(imgs, labels)):
            img = cv2.resize(img, (tw, th))
            # destination rectangle on the canvas, clipped to its bounds
            if i == 0:    # top-left
                x1, y1, x2, y2 = max(cx - tw, 0), max(cy - th, 0), cx, cy
                sx1, sy1 = tw - (x2 - x1), th - (y2 - y1)
            elif i == 1:  # top-right
                x1, y1, x2, y2 = cx, max(cy - th, 0), min(cx + tw, s), cy
                sx1, sy1 = 0, th - (y2 - y1)
            elif i == 2:  # bottom-left
                x1, y1, x2, y2 = max(cx - tw, 0), cy, cx, min(cy + th, s)
                sx1, sy1 = tw - (x2 - x1), 0
            else:         # bottom-right
                x1, y1, x2, y2 = cx, cy, min(cx + tw, s), min(cy + th, s)
                sx1, sy1 = 0, 0
            sx2, sy2 = sx1 + (x2 - x1), sy1 + (y2 - y1)
            # paste the visible part of the tile
            mosaic_img[y1:y2, x1:x2] = img[sy1:sy2, sx1:sx2]
            # remap labels from tile coordinates to mosaic coordinates
            for cls, bx, by, bw, bh in label:
                px = bx * tw - sx1 + x1  # tile pixels -> mosaic pixels
                py = by * th - sy1 + y1
                pw, ph = bw * tw, bh * th
                # clip the box to the pasted region
                bx1 = np.clip(px - pw / 2, x1, x2)
                by1 = np.clip(py - ph / 2, y1, y2)
                bx2 = np.clip(px + pw / 2, x1, x2)
                by2 = np.clip(py + ph / 2, y1, y2)
                if bx2 - bx1 > 1 and by2 - by1 > 1:  # drop degenerate boxes
                    mosaic_labels.append([
                        cls,
                        (bx1 + bx2) / 2 / s, (by1 + by2) / 2 / s,
                        (bx2 - bx1) / s, (by2 - by1) / s,
                    ])
        return mosaic_img, np.array(mosaic_labels)

Copy-Paste augmentation
class CopyPasteAugmentation:
    def __init__(self, prob=0.5, max_objects=30):
        self.prob = prob
        self.max_objects = max_objects

    def __call__(self, image, labels, paste_objects):
        """
        paste_objects: a pre-cropped library of small objects
        """
        if random.random() > self.prob:
            return image, labels
        h, w = image.shape[:2]
        new_labels = labels.copy()
        # randomly choose how many small objects to paste
        n_paste = random.randint(1, min(self.max_objects, len(paste_objects)))
        selected = random.sample(paste_objects, n_paste)
        for obj in selected:
            obj_img, obj_mask, obj_label = obj
            oh, ow = obj_img.shape[:2]
            # random position (keep away from the border)
            max_x = w - ow
            max_y = h - oh
            if max_x <= 0 or max_y <= 0:
                continue
            x = random.randint(0, max_x)
            y = random.randint(0, max_y)
            # overlap check (optional)
            overlap = self.check_overlap(new_labels, x, y, ow, oh)
            if overlap > 0.3:  # overlap threshold
                continue
            # blend the object in through its mask
            image[y:y+oh, x:x+ow] = \
                image[y:y+oh, x:x+ow] * (1 - obj_mask) + obj_img * obj_mask
            # append the new label
            cx = (x + ow/2) / w
            cy = (y + oh/2) / h
            bw = ow / w
            bh = oh / h
            new_labels = np.vstack([new_labels, [obj_label, cx, cy, bw, bh]])
        return image, new_labels

    def check_overlap(self, labels, x, y, w, h):
        # compute the overlap with existing objects
        # implementation omitted...
        return 0.0

2. Oversampling Small Objects
from torch.utils.data import DataLoader, WeightedRandomSampler

def create_weighted_sampler(dataset, size_threshold=32*32):
    """
    Give images that contain small objects a higher sampling weight.
    """
    weights = []
    for idx in range(len(dataset)):
        _, targets = dataset[idx]
        # count the small objects
        small_obj_count = 0
        for target in targets:
            # assumes targets are [class, x, y, w, h], normalized
            area = target[3] * target[4] * dataset.img_size ** 2
            if area < size_threshold:
                small_obj_count += 1
        # weight grows with the number of small objects
        weight = 1.0 + small_obj_count * 0.5  # tunable coefficient
        weights.append(weight)
    # build the sampler
    sampler = WeightedRandomSampler(
        weights=weights,
        num_samples=len(dataset),
        replacement=True
    )
    return sampler

# usage example
train_sampler = create_weighted_sampler(train_dataset)
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    sampler=train_sampler,
    num_workers=4
)

3. Loss Function Optimization
# the IoU family of losses
import math

class IoULoss(nn.Module):
    def __init__(self, loss_type="ciou"):
        super().__init__()
        self.loss_type = loss_type

    def forward(self, pred_boxes, target_boxes):
        """
        pred_boxes, target_boxes: [N, 4] (x1, y1, x2, y2)
        """
        # intersection
        inter_x1 = torch.max(pred_boxes[:, 0], target_boxes[:, 0])
        inter_y1 = torch.max(pred_boxes[:, 1], target_boxes[:, 1])
        inter_x2 = torch.min(pred_boxes[:, 2], target_boxes[:, 2])
        inter_y2 = torch.min(pred_boxes[:, 3], target_boxes[:, 3])
        inter_area = torch.clamp(inter_x2 - inter_x1, min=0) * \
                     torch.clamp(inter_y2 - inter_y1, min=0)
        # union
        pred_area = (pred_boxes[:, 2] - pred_boxes[:, 0]) * \
                    (pred_boxes[:, 3] - pred_boxes[:, 1])
        target_area = (target_boxes[:, 2] - target_boxes[:, 0]) * \
                      (target_boxes[:, 3] - target_boxes[:, 1])
        union_area = pred_area + target_area - inter_area
        # plain IoU
        iou = inter_area / (union_area + 1e-7)

        if self.loss_type == "iou":
            loss = 1 - iou
        elif self.loss_type == "giou":
            # smallest enclosing box
            enclose_x1 = torch.min(pred_boxes[:, 0], target_boxes[:, 0])
            enclose_y1 = torch.min(pred_boxes[:, 1], target_boxes[:, 1])
            enclose_x2 = torch.max(pred_boxes[:, 2], target_boxes[:, 2])
            enclose_y2 = torch.max(pred_boxes[:, 3], target_boxes[:, 3])
            enclose_area = (enclose_x2 - enclose_x1) * (enclose_y2 - enclose_y1)
            giou = iou - (enclose_area - union_area) / enclose_area
            loss = 1 - giou
        else:  # "diou" or "ciou"
            # center-point distance
            pred_cx = (pred_boxes[:, 0] + pred_boxes[:, 2]) / 2
            pred_cy = (pred_boxes[:, 1] + pred_boxes[:, 3]) / 2
            target_cx = (target_boxes[:, 0] + target_boxes[:, 2]) / 2
            target_cy = (target_boxes[:, 1] + target_boxes[:, 3]) / 2
            center_dist = (pred_cx - target_cx) ** 2 + (pred_cy - target_cy) ** 2
            # squared diagonal of the enclosing box
            enclose_x1 = torch.min(pred_boxes[:, 0], target_boxes[:, 0])
            enclose_y1 = torch.min(pred_boxes[:, 1], target_boxes[:, 1])
            enclose_x2 = torch.max(pred_boxes[:, 2], target_boxes[:, 2])
            enclose_y2 = torch.max(pred_boxes[:, 3], target_boxes[:, 3])
            diagonal_dist = (enclose_x2 - enclose_x1) ** 2 + \
                            (enclose_y2 - enclose_y1) ** 2
            diou = iou - center_dist / (diagonal_dist + 1e-7)
            if self.loss_type == "diou":
                loss = 1 - diou
            else:
                # CIoU adds an aspect-ratio penalty on top of DIoU
                # (standard formulation from Zheng et al., 2020)
                pred_w = pred_boxes[:, 2] - pred_boxes[:, 0]
                pred_h = pred_boxes[:, 3] - pred_boxes[:, 1]
                target_w = target_boxes[:, 2] - target_boxes[:, 0]
                target_h = target_boxes[:, 3] - target_boxes[:, 1]
                v = (4 / math.pi ** 2) * (
                    torch.atan(target_w / (target_h + 1e-7)) -
                    torch.atan(pred_w / (pred_h + 1e-7))
                ) ** 2
                with torch.no_grad():
                    alpha = v / (1 - iou + v + 1e-7)
                loss = 1 - (diou - alpha * v)
        return loss.mean()

# Focal Loss for class imbalance
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, pred, target):
        """
        pred: [N, num_classes] class logits
        target: [N] ground-truth class indices
        """
        ce_loss = F.cross_entropy(pred, target, reduction='none')
        p_t = torch.exp(-ce_loss)
        # focal modulation: down-weight easy examples
        focal_weight = (1 - p_t) ** self.gamma
        # scalar alpha balancing; for per-class balancing, pass a
        # class-weight vector to F.cross_entropy instead
        loss = self.alpha * focal_weight * ce_loss
        return loss.mean()

III. Inference and Post-Processing
1. Test-Time Augmentation (TTA)
class TTA:
    def __init__(self, model, scales=[0.8, 1.0, 1.2], flips=[False, True]):
        self.model = model
        self.scales = scales
        self.flips = flips

    def __call__(self, image):
        predictions = []
        for scale in self.scales:
            for flip in self.flips:
                # rescale
                scaled_img = F.interpolate(
                    image,
                    scale_factor=scale,
                    mode='bilinear'
                )
                # horizontal flip
                if flip:
                    scaled_img = torch.flip(scaled_img, dims=[-1])
                # predict
                with torch.no_grad():
                    pred = self.model(scaled_img)
                # undo the augmentation (assumes pixel-coordinate cx, cy, w, h outputs)
                if flip:
                    pred[..., 0] = scaled_img.shape[-1] - pred[..., 0]  # un-flip x
                pred[..., :4] /= scale  # undo the rescaling
                predictions.append(pred)
        # merge all predictions
        return self.merge_predictions(predictions)

    def merge_predictions(self, predictions):
        """
        Merge the augmented predictions with NMS (or a voting scheme).
        decode_prediction and nms are the model's own helpers.
        """
        all_boxes = []
        all_scores = []
        all_classes = []
        for pred in predictions:
            boxes, scores, classes = self.decode_prediction(pred)
            all_boxes.append(boxes)
            all_scores.append(scores)
            all_classes.append(classes)
        # concatenate everything
        all_boxes = torch.cat(all_boxes, dim=0)
        all_scores = torch.cat(all_scores, dim=0)
        all_classes = torch.cat(all_classes, dim=0)
        # NMS
        keep = self.nms(all_boxes, all_scores, iou_threshold=0.5)
        return all_boxes[keep], all_scores[keep], all_classes[keep]

2. Improved NMS
def soft_nms(boxes, scores, sigma=0.5, score_threshold=0.001):
    """
    Soft-NMS: instead of discarding overlapping boxes outright,
    decay their scores by a Gaussian of the overlap.
    """
    scores = scores.clone()
    keep = []
    remaining = torch.arange(boxes.shape[0])
    while remaining.numel() > 0:
        # take the highest-scoring remaining box
        top = scores[remaining].argmax()
        i = remaining[top]
        keep.append(i.item())
        remaining = torch.cat([remaining[:top], remaining[top + 1:]])
        if remaining.numel() == 0:
            break
        # IoU between box i and the remaining boxes
        xx1 = torch.max(boxes[i, 0], boxes[remaining, 0])
        yy1 = torch.max(boxes[i, 1], boxes[remaining, 1])
        xx2 = torch.min(boxes[i, 2], boxes[remaining, 2])
        yy2 = torch.min(boxes[i, 3], boxes[remaining, 3])
        w = torch.clamp(xx2 - xx1, min=0)
        h = torch.clamp(yy2 - yy1, min=0)
        inter = w * h
        area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
        areas = (boxes[remaining, 2] - boxes[remaining, 0]) * \
                (boxes[remaining, 3] - boxes[remaining, 1])
        iou = inter / (area_i + areas - inter)
        # Soft-NMS: decay scores according to the IoU
        scores[remaining] *= torch.exp(-(iou ** 2) / sigma)
        # keep only boxes whose score stays above the threshold
        remaining = remaining[scores[remaining] > score_threshold]
    return keep

from torchvision.ops import box_iou

def diou_nms(boxes, scores, iou_threshold=0.5):
    """
    DIoU-NMS: NMS that also accounts for center-point distance,
    which helps keep nearby small objects apart.
    """
    keep = []
    order = scores.argsort(descending=True)
    while order.numel() > 0:
        i = order[0]
        keep.append(i)
        if order.numel() == 1:
            break
        # IoU
        iou = box_iou(boxes[i].unsqueeze(0), boxes[order[1:]])[0]
        # center-point distance
        center_i = (boxes[i, :2] + boxes[i, 2:]) / 2
        centers = (boxes[order[1:], :2] + boxes[order[1:], 2:]) / 2
        center_distance = ((center_i - centers) ** 2).sum(dim=1).sqrt()
        # diagonal of the enclosing box
        enclose_mins = torch.min(boxes[i, :2], boxes[order[1:], :2])
        enclose_maxs = torch.max(boxes[i, 2:], boxes[order[1:], 2:])
        diagonal = ((enclose_maxs - enclose_mins) ** 2).sum(dim=1).sqrt()
        # DIoU
        diou = iou - (center_distance / diagonal) ** 2
        # keep boxes whose DIoU stays below the threshold
        idx = torch.where(diou < iou_threshold)[0]
        order = order[idx + 1]
    return keep

IV. Practical Engineering Techniques
1. Annotation Quality Control
class LabelQualityChecker:
    def __init__(self, min_size=2, max_aspect_ratio=10):
        self.min_size = min_size
        self.max_aspect_ratio = max_aspect_ratio

    def check_dataset(self, dataset):
        """
        Audit the annotation quality of a dataset.
        """
        issues = []
        for idx, (image, labels) in enumerate(dataset):
            h, w = image.shape[:2]
            for label_idx, label in enumerate(labels):
                cls, x, y, bw, bh = label
                # check the box size in pixels
                pixel_w = bw * w
                pixel_h = bh * h
                if pixel_w < self.min_size or pixel_h < self.min_size:
                    issues.append({
                        'type': 'too_small',
                        'image_idx': idx,
                        'label_idx': label_idx,
                        'size': (pixel_w, pixel_h)
                    })
                    continue  # skip further checks on degenerate boxes
                # check the aspect ratio
                aspect_ratio = max(pixel_w / pixel_h, pixel_h / pixel_w)
                if aspect_ratio > self.max_aspect_ratio:
                    issues.append({
                        'type': 'extreme_aspect_ratio',
                        'image_idx': idx,
                        'label_idx': label_idx,
                        'aspect_ratio': aspect_ratio
                    })
                # check that the box stays inside the image
                if x - bw/2 < 0 or x + bw/2 > 1 or \
                   y - bh/2 < 0 or y + bh/2 > 1:
                    issues.append({
                        'type': 'out_of_bounds',
                        'image_idx': idx,
                        'label_idx': label_idx
                    })
        return issues

# semi-automatic labeling assistance
class SemiAutoLabeling:
    def __init__(self, model, confidence_threshold=0.7):
        self.model = model
        self.confidence_threshold = confidence_threshold

    def generate_pseudo_labels(self, unlabeled_images):
        """
        Generate pseudo-labels with a trained model.
        decode_predictions is the model's own decoding helper.
        """
        pseudo_labels = []
        for image in unlabeled_images:
            with torch.no_grad():
                predictions = self.model(image)
            # keep only high-confidence predictions
            boxes, scores, classes = self.decode_predictions(predictions)
            high_conf_mask = scores > self.confidence_threshold
            boxes = boxes[high_conf_mask]
            scores = scores[high_conf_mask]
            classes = classes[high_conf_mask]
            # flag items for manual review
            labels = []
            for box, score, cls in zip(boxes, scores, classes):
                labels.append({
                    'box': box,
                    'class': cls,
                    'score': score,
                    'need_review': score < 0.9  # mark borderline labels for review
                })
            pseudo_labels.append(labels)
        return pseudo_labels

2. Hybrid Model Strategies
class SuperResolutionDetector:
    def __init__(self, sr_model, detection_model, sr_scale=2):
        self.sr_model = sr_model  # super-resolution model
        self.detection_model = detection_model
        self.sr_scale = sr_scale

    def detect(self, image, use_sr_threshold=32):
        """
        Apply super-resolution to regions that contain small objects.
        decode_detections and merge_detections are this class's own helpers.
        """
        # first pass: coarse detections
        with torch.no_grad():
            initial_detections = self.detection_model(image)
        boxes, scores, classes = self.decode_detections(initial_detections)
        # find the small-object regions
        small_regions = []
        h, w = image.shape[-2:]
        for box in boxes:
            box_w = (box[2] - box[0]) * w
            box_h = (box[3] - box[1]) * h
            if box_w < use_sr_threshold or box_h < use_sr_threshold:
                # expand the region with a little padding
                x1 = max(0, int(box[0] * w - 10))
                y1 = max(0, int(box[1] * h - 10))
                x2 = min(w, int(box[2] * w + 10))
                y2 = min(h, int(box[3] * h + 10))
                small_regions.append((x1, y1, x2, y2))
        # run super-resolution on each small-object region
        enhanced_detections = []
        for x1, y1, x2, y2 in small_regions:
            # crop the region
            region = image[:, :, y1:y2, x1:x2]
            # super-resolve
            with torch.no_grad():
                sr_region = self.sr_model(region)
            # re-detect on the enhanced region
            with torch.no_grad():
                region_detections = self.detection_model(sr_region)
            # map the detections back to original-image coordinates
            region_boxes, region_scores, region_classes = \
                self.decode_detections(region_detections)
            # with normalized box coordinates the SR factor cancels out:
            # a box at fraction f of the SR crop sits at fraction f of the crop
            for box in region_boxes:
                box[0] = (box[0] * (x2 - x1) + x1) / w
                box[1] = (box[1] * (y2 - y1) + y1) / h
                box[2] = (box[2] * (x2 - x1) + x1) / w
                box[3] = (box[3] * (y2 - y1) + y1) / h
            enhanced_detections.append((region_boxes, region_scores, region_classes))
        # merge everything
        return self.merge_detections(initial_detections, enhanced_detections)

3. Lightweight Branch Design
class LightweightSmallObjectBranch(nn.Module):
    def __init__(self, in_channels=64):
        super().__init__()
        # lightweight (depthwise-separable) convolution
        self.dw_conv = nn.Conv2d(
            in_channels, in_channels, 3,
            padding=1, groups=in_channels
        )
        self.pw_conv = nn.Conv2d(in_channels, in_channels // 2, 1)
        # channel-attention module (SE-style)
        self.attention = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(in_channels // 2, in_channels // 8, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels // 8, in_channels // 2, 1),
            nn.Sigmoid()
        )
        # detection head
        self.detect = nn.Conv2d(in_channels // 2, 5, 1)  # 4 bbox + 1 objectness

    def forward(self, x):
        # depthwise-separable convolution
        x = self.dw_conv(x)
        x = self.pw_conv(x)
        # channel attention
        att = self.attention(x)
        x = x * att
        # detection output
        return self.detect(x)

# integrating the branch into the main model
class YOLOWithSmallObjectBranch(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        # add the small-object branch on the shallowest features
        self.small_obj_branch = LightweightSmallObjectBranch(64)

    def forward(self, x):
        # extract multi-scale features
        features = self.base_model.extract_features(x)
        # shallow features feed the small-object branch
        small_obj_pred = self.small_obj_branch(features[0])  # shallowest level
        # regular detection
        regular_pred = self.base_model.detect(features)
        return {
            'small': small_obj_pred,
            'regular': regular_pred
        }

V. Version-Specific YOLO Optimizations
Configuration specifics for YOLOv3–v5
# yolov5s-small-objects.yaml
nc: 80  # number of classes
depth_multiple: 0.33
width_multiple: 0.50

# anchors - optimized for small objects
anchors:
  - [4,5, 6,8, 9,12]            # P2/4 - new, very small objects
  - [10,13, 16,30, 33,23]       # P3/8 - small objects
  - [30,61, 62,45, 59,119]      # P4/16 - medium objects
  - [116,90, 156,198, 373,326]  # P5/32 - large objects

# backbone
backbone:
  # [from, number, module, args]
  [[-1, 1, Conv, [64, 6, 2, 2]],   # 0-P1/2
   [-1, 1, Conv, [128, 3, 2]],     # 1-P2/4 - kept for detection
   [-1, 3, C3, [128]],
   [-1, 1, Conv, [256, 3, 2]],     # 3-P3/8
   [-1, 6, C3, [256]],
   [-1, 1, Conv, [512, 3, 2]],     # 5-P4/16
   [-1, 9, C3, [512]],
   [-1, 1, Conv, [1024, 3, 2]],    # 7-P5/32
   [-1, 3, C3, [1024]],
   [-1, 1, SPPF, [1024, 5]]]       # 9

# head - includes the P2 detection layer
head:
  [[-1, 1, Conv, [512, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],      # cat backbone P4
   [-1, 3, C3, [512, False]],      # 13
   [-1, 1, Conv, [256, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 4], 1, Concat, [1]],      # cat backbone P3
   [-1, 3, C3, [256, False]],      # 17 (P3/8-small)
   [-1, 1, Conv, [128, 1, 1]],     # new: upsample toward P2
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 2], 1, Concat, [1]],      # cat backbone P2
   [-1, 3, C3, [128, False]],      # 21 (P2/4-xsmall)
   [-1, 1, Conv, [128, 3, 2]],
   [[-1, 18], 1, Concat, [1]],     # cat head P3
   [-1, 3, C3, [256, False]],      # 24 (P3/8-small)
   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 14], 1, Concat, [1]],     # cat head P4
   [-1, 3, C3, [512, False]],      # 27 (P4/16-medium)
   [-1, 1, Conv, [512, 3, 2]],
   [[-1, 10], 1, Concat, [1]],     # cat head P5
   [-1, 3, C3, [1024, False]],     # 30 (P5/32-large)
   [[21, 24, 27, 30], 1, Detect, [nc, anchors]]]  # Detect includes P2
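Launching training with this config from inside the ultralytics/yolov5 repository should look roughly like the sketch below; train.run and its keyword names follow the repo's train.py interface, and the file names are placeholders:

# a minimal sketch of training YOLOv5 with the P2 config above
import train  # yolov5's train.py

train.run(
    cfg='models/yolov5s-small-objects.yaml',  # the P2 model definition above
    data='data/my_small_objects.yaml',        # hypothetical dataset config
    imgsz=1280,     # higher input resolution for small objects
    batch_size=8,   # smaller batch to compensate for the larger input
    epochs=300,
)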
YOLOv8-Specific Optimizations
# YOLOv8 small-object optimization settings
def create_yolov8_small_object_config():
    return {
        # model configuration
        'model': {
            'nc': 80,
            'scales': {
                'n': [0.33, 0.25],  # depth, width
                's': [0.33, 0.50],
                'm': [0.67, 0.75],
                'l': [1.00, 1.00],
                'x': [1.00, 1.25],
            }
        },
        # training configuration
        'train': {
            'imgsz': 1280,  # higher resolution
            'batch': 16,
            'epochs': 300,
            'patience': 50,
            'optimizer': 'AdamW',
            'lr0': 0.001,
            'lrf': 0.01,
            'momentum': 0.937,
            'weight_decay': 0.0005,
            'warmup_epochs': 3.0,
            'warmup_momentum': 0.8,
            'warmup_bias_lr': 0.1,
            # loss weights - raise the box weight to stress localization
            'box': 7.5,  # default 7.5
            'cls': 0.5,  # default 0.5
            'dfl': 1.5,  # default 1.5
            # data augmentation
            'hsv_h': 0.015,
            'hsv_s': 0.7,
            'hsv_v': 0.4,
            'degrees': 0.0,  # avoid heavy rotation for small objects
            'translate': 0.1,
            'scale': 0.5,
            'shear': 0.0,
            'perspective': 0.0,
            'flipud': 0.0,
            'fliplr': 0.5,
            'mosaic': 1.0,
            'mixup': 0.1,
            'copy_paste': 0.3,  # Copy-Paste augmentation
        },
        # validation configuration
        'val': {
            'imgsz': 1280,
            'conf': 0.001,  # lower the confidence threshold
            'iou': 0.6,
            'max_det': 300,
        }
    }
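These settings plug straight into the ultralytics API; the keyword names match ultralytics' documented train()/val() arguments, and the dataset path below is a placeholder:

# a minimal sketch of applying the settings with ultralytics
from ultralytics import YOLO

cfg = create_yolov8_small_object_config()
model = YOLO('yolov8s.yaml')  # or a pretrained 'yolov8s.pt'
model.train(
    data='my_small_objects.yaml',  # hypothetical dataset config
    **cfg['train'],
)
metrics = model.val(**cfg['val'])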
VI. Performance Evaluation and Debugging
1. Small-Object-Specific Evaluation Metrics
def evaluate_small_objects(predictions, ground_truths, size_ranges):
    """
    Evaluate detection performance per size bucket.
    size_ranges: [(0, 32), (32, 96), (96, float('inf'))]
    """
    results = {}
    for size_min, size_max in size_ranges:
        # keep only ground-truth boxes in this size range
        size_gts = []
        for gt in ground_truths:
            area = gt['width'] * gt['height']
            if size_min <= area < size_max:
                size_gts.append(gt)
        # AP for this size range (calculate_ap defined elsewhere)
        if len(size_gts) > 0:
            ap = calculate_ap(predictions, size_gts)
            results[f'{size_min}-{size_max}'] = ap
        else:
            results[f'{size_min}-{size_max}'] = 0.0
    return results

# COCO-style evaluation
class COCOEvaluator:
    def __init__(self):
        self.area_ranges = [
            (0, 32**2),       # small
            (32**2, 96**2),   # medium
            (96**2, 1e5**2)   # large
        ]

    def evaluate(self, dataset, model):
        # gather all predictions and ground truths
        all_predictions = []
        all_ground_truths = []
        for image, gt in dataset:
            pred = model(image)
            all_predictions.extend(pred)
            all_ground_truths.extend(gt)
        # overall metrics
        metrics = {
            'mAP': self.calculate_map(all_predictions, all_ground_truths),
            'mAP_50': self.calculate_map(all_predictions, all_ground_truths, iou_thresh=0.5),
            'mAP_75': self.calculate_map(all_predictions, all_ground_truths, iou_thresh=0.75),
        }
        # per-size metrics
        for i, (min_area, max_area) in enumerate(self.area_ranges):
            area_name = ['small', 'medium', 'large'][i]
            metrics[f'mAP_{area_name}'] = self.calculate_map_by_area(
                all_predictions, all_ground_truths, min_area, max_area
            )
        return metrics
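For standardized numbers, the same size buckets come for free from pycocotools once predictions are exported to COCO JSON; the annotation and prediction file paths below are placeholders:

# a minimal COCO-style evaluation sketch using pycocotools
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

coco_gt = COCO('annotations/instances_val.json')  # placeholder path
coco_dt = coco_gt.loadRes('predictions.json')     # placeholder path
evaluator = COCOeval(coco_gt, coco_dt, iouType='bbox')
evaluator.evaluate()
evaluator.accumulate()
evaluator.summarize()  # prints AP/AR, including AP_small (area < 32^2)
ap_small = evaluator.stats[3]  # AP for small objects in the stats vector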
2. Visualization and Debugging Tools
import cv2
import matplotlib.pyplot as plt
def visualize_small_object_detections(image, predictions, ground_truths,
                                      size_threshold=32):
    """
    Visualize small-object detection results.
    calculate_iou is a standard box-IoU helper defined elsewhere.
    """
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    # original image + all detections
    ax1 = axes[0]
    ax1.imshow(image)
    ax1.set_title('All Detections')
    for pred in predictions:
        rect = plt.Rectangle(
            (pred['x'], pred['y']), pred['w'], pred['h'],
            fill=False, color='red', linewidth=1
        )
        ax1.add_patch(rect)
    # small-object detections only
    ax2 = axes[1]
    ax2.imshow(image)
    ax2.set_title(f'Small Objects (<{size_threshold}px)')
    for pred in predictions:
        if pred['w'] < size_threshold or pred['h'] < size_threshold:
            rect = plt.Rectangle(
                (pred['x'], pred['y']), pred['w'], pred['h'],
                fill=False, color='yellow', linewidth=2
            )
            ax2.add_patch(rect)
    # missed-detection analysis
    ax3 = axes[2]
    ax3.imshow(image)
    ax3.set_title('Missed Detections')
    for gt in ground_truths:
        matched = False
        for pred in predictions:
            iou = calculate_iou(gt, pred)
            if iou > 0.5:
                matched = True
                break
        if not matched:
            rect = plt.Rectangle(
                (gt['x'], gt['y']), gt['w'], gt['h'],
                fill=False, color='blue', linewidth=2
            )
            ax3.add_patch(rect)
    plt.tight_layout()
    plt.show()

# heatmap analysis
def generate_detection_heatmap(model, image, stride=8):
    """
    Build a detection-response heatmap with a sliding window.
    Assumes the model exposes a get_objectness(patch) helper.
    """
    h, w = image.shape[:2]
    heatmap = np.zeros((h // stride, w // stride))
    # sliding-window scoring
    for y in range(0, h - stride, stride):
        for x in range(0, w - stride, stride):
            patch = image[y:y+stride*2, x:x+stride*2]
            with torch.no_grad():
                response = model.get_objectness(patch)
            heatmap[y//stride, x//stride] = response.max()
    # upsample back to image size
    heatmap = cv2.resize(heatmap, (w, h))
    # overlay
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.imshow(image)
    plt.title('Original Image')
    plt.subplot(1, 2, 2)
    plt.imshow(image)
    plt.imshow(heatmap, alpha=0.5, cmap='jet')
    plt.colorbar()
    plt.title('Detection Heatmap')
    plt.show()

VII. Best-Practice Summary
Recommended configurations by scenario
| Scenario | Recommended configuration |
|---|---|
| Bacteria detection in medical imaging | YOLOv8 + P2 layer + 1280 input + Copy-Paste + Soft-NMS |
| Small objects in drone imagery | YOLOv5 + custom anchors + TTA + super-resolution |
| PCB defect inspection | YOLOv8 + decoupled head + multi-scale training + DIoU-NMS |
| Satellite remote sensing | YOLOv10 + sliced (tiled) inference + global-local fusion (see the tiling sketch below) |
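The table's "sliced (tiled) inference" entry is the one technique this guide has not yet shown in code. Here is a minimal sketch, assuming a detect(img) callable that returns normalized [cls, cx, cy, w, h] boxes per tile; production pipelines usually use a dedicated library such as SAHI:

def sliced_inference(image, detect, tile=640, overlap=0.2):
    """Run a detector over overlapping tiles and map boxes back to the full image."""
    h, w = image.shape[:2]
    step = int(tile * (1 - overlap))
    all_boxes = []
    for y0 in range(0, max(h - tile, 0) + 1, step):
        for x0 in range(0, max(w - tile, 0) + 1, step):
            crop = image[y0:y0 + tile, x0:x0 + tile]
            for cls, cx, cy, bw, bh in detect(crop):  # normalized tile coords
                all_boxes.append([
                    cls,
                    (cx * tile + x0) / w,  # back to full-image coordinates
                    (cy * tile + y0) / h,
                    bw * tile / w,
                    bh * tile / h,
                ])
    # overlapping tiles produce duplicates: merge with NMS afterwards
    # (edge tiles that don't align with the step are omitted for brevity)
    return all_boxes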
Tuning priorities
1. Data (largest payoff)
   - Improve annotation quality
   - Add more small-object samples
   - Copy-Paste augmentation
2. Model architecture (next)
   - Add a P2 detection layer
   - Use PANet/BiFPN
   - Decoupled head
3. Training strategy
   - Multi-scale training
   - Mosaic augmentation
   - Well-chosen loss weights
4. Post-processing
   - Soft-NMS
   - TTA
   - Confidence-threshold tuning
Common problems and fixes
1. High miss rate
   - Lower the confidence threshold
   - Increase the input resolution
   - Use TTA
2. High false-positive rate
   - Raise the NMS threshold
   - Train harder on negative samples
   - Use Focal Loss
3. Slow inference
   - Use a lightweight branch
   - Reduce the number of TTA scales
   - Prune or quantize the model
4. Out of GPU memory
   - Reduce the batch size
   - Use gradient accumulation
   - Mixed-precision training (both sketched below)
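Gradient accumulation and mixed precision combine naturally when memory is tight. A minimal PyTorch sketch; model, dataloader, and compute_loss stand in for your own training pieces:

import torch

def low_memory_epoch(model, dataloader, optimizer, compute_loss, accum_steps=4):
    """One epoch with gradient accumulation + automatic mixed precision."""
    scaler = torch.cuda.amp.GradScaler()
    optimizer.zero_grad()
    for step, (images, targets) in enumerate(dataloader):
        with torch.cuda.amp.autocast():  # fp16 forward pass
            loss = compute_loss(model(images), targets) / accum_steps
        scaler.scale(loss).backward()    # accumulate scaled gradients
        if (step + 1) % accum_steps == 0:  # effective batch = batch x accum_steps
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()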
With a sensible combination of these strategies, the YOLO family can deliver strong small-object detection performance. The key is to pick the right mix of optimizations for your specific scenario and keep refining it through experiments.