xhs_factory/services/topic_engine.py
zhoujie 1ec520b47e feat(content): 新增智能选题引擎、批量创作和图文协同优化
- 新增智能选题引擎 `TopicEngine`,整合热点数据与历史权重,提供多维度评分和创作角度建议
- 新增内容模板系统 `ContentTemplate`,支持从 JSON 文件加载模板并应用于文案生成
- 新增批量创作功能 `batch_generate_copy`,支持串行生成多篇文案并自动入草稿队列
- 升级文案质量流水线:实现 Prompt 分层架构(基础层 + 风格层 + 人设层)、LLM 自检与改写机制、深度去 AI 化后处理
- 优化图文协同:新增封面图策略选择、SD prompt 与文案语义联动、图文匹配度评估
- 集成数据闭环:在文案生成中自动注入 `AnalyticsService` 权重数据,实现发布 → 数据回收 → 优化创作的完整循环
- 更新 UI 组件:新增选题推荐展示区、批量创作折叠面板、封面图策略选择器和图文匹配度评分展示

♻️ refactor(llm): 重构 Prompt 架构并增强去 AI 化处理

- 将 `PROMPT_COPYWRITING` 拆分为分层架构(基础层 + 风格层 + 人设层),提高维护性和灵活性
- 增强 `_humanize_content` 方法:新增语气词注入、标点不规范化、段落节奏打散和 emoji 密度控制
- 新增 `_self_check` 和 `_self_check_rewrite` 方法,实现文案 AI 痕迹自检与自动改写
- 新增 `evaluate_image_text_match` 方法,支持文案与 SD prompt 的语义匹配度评估(可选,失败不阻塞)
- 新增封面图策略配置 `COVER_STRATEGIES` 和情感基调映射 `EMOTION_SD_MAP`

📝 docs(openspec): 归档内容创作优化提案和详细规格

- 新增 `openspec/changes/archive/2026-02-28-optimize-content-creation/` 目录,包含设计文档、提案、规格说明和任务清单
- 新增 `openspec/specs/` 下的批量创作、文案质量流水线、图文协同、服务内容和智能选题引擎规格文档
- 更新 `openspec/specs/services-content/spec.md`,反映新增的批量创作和智能选题入口函数

🔧 chore(config): 更新服务配置和 UI 集成

- 在 `services/content.py` 中集成权重数据自动注入逻辑,实现数据驱动创作
- 在 `ui/app.py` 中新增选题推荐、批量生成和图文匹配度评估的回调函数
- 在 `ui/tab_create.py` 中新增智能选题推荐区、批量创作面板和图文匹配度评估组件
- 修复 `services/sd_service.py` 中的头像文件路径问题,确保目录存在
2026-02-28 21:04:09 +08:00

463 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
services/topic_engine.py
智能选题引擎 — 聚合热点数据 + 历史权重,推荐高潜力选题
"""
import logging
import os
import json
import re
from datetime import datetime, timedelta
# Module logger on the shared "autobot" channel (configured elsewhere).
logger = logging.getLogger("autobot")
class TopicEngine:
    """
    Intelligent topic-recommendation engine.

    Responsibility: aggregate hotspot-detection results with historical
    engagement weights and recommend high-potential topics to the user.
    It never accesses MCP / LLM directly; all data is obtained through the
    injected AnalyticsService.
    """
    def __init__(self, analytics_service) -> None:
        """
        Args:
            analytics_service: AnalyticsService instance providing weight and
                note data (read via its ``_weights`` / ``_analytics_data``
                attributes).
        """
        self.analytics = analytics_service
# ========== 核心: 多维度评分 ==========
def score_topic(self, topic: str, hotspot_data: dict = None) -> dict:
"""
为单个候选主题计算综合评分
维度:
- hotspot_score (0-40): 热点热度
- weight_score (0-30): 历史互动权重
- scarcity_score(0-20): 内容稀缺度
- timeliness_score(0-10): 时效性
Args:
topic: 候选主题文本
hotspot_data: 可选的热点分析数据(包含 hot_topics, suggestions 等)
Returns:
dict with total_score, hotspot_score, weight_score, scarcity_score, timeliness_score
"""
hotspot_score = self._calc_hotspot_score(topic, hotspot_data)
weight_score = self._calc_weight_score(topic)
scarcity_score = self._calc_scarcity_score(topic)
timeliness_score = self._calc_timeliness_score(topic)
total = hotspot_score + weight_score + scarcity_score + timeliness_score
return {
"total_score": total,
"hotspot_score": hotspot_score,
"weight_score": weight_score,
"scarcity_score": scarcity_score,
"timeliness_score": timeliness_score,
}
# ========== 推荐主题列表 ==========
def recommend_topics(self, count: int = 5, hotspot_data: dict = None) -> list[dict]:
"""
推荐排序后的选题列表
逻辑:
1. 收集候选主题 (热点 + 权重主题)
2. 对每个主题评分
3. 去重 (语义相近合并)
4. 按总分降序取 top-N
5. 为每个主题生成创作角度建议
Args:
count: 返回推荐数量 (默认 5)
hotspot_data: 可选的热点分析数据
Returns:
list of dict, 每项包含:
topic, score, reason, source, angles,
score_detail (各维度分数)
"""
candidates = self._collect_candidates(hotspot_data)
if not candidates:
logger.warning("选题引擎: 无候选主题可推荐")
return []
# 评分
scored = []
for topic, source in candidates:
detail = self.score_topic(topic, hotspot_data)
scored.append({
"topic": topic,
"score": detail["total_score"],
"source": source,
"score_detail": detail,
})
# 去重
scored = self._deduplicate(scored)
# 排序
scored.sort(key=lambda x: x["score"], reverse=True)
scored = scored[:count]
# 生成 reason 和 angles
for item in scored:
item["reason"] = self._generate_reason(item)
item["angles"] = self._generate_angles(item["topic"], item["source"])
return scored
# ========== 候选收集 ==========
def _collect_candidates(self, hotspot_data: dict = None) -> list[tuple[str, str]]:
"""
收集所有候选主题,返回 [(topic, source), ...]
source: "hotspot" | "weight" | "trend"
"""
candidates = []
seen = set()
# 1. 从热点数据收集
if hotspot_data:
for topic in hotspot_data.get("hot_topics", []):
topic_clean = self._clean_topic(topic)
if topic_clean and topic_clean not in seen:
candidates.append((topic_clean, "hotspot"))
seen.add(topic_clean)
for suggestion in hotspot_data.get("suggestions", []):
topic_clean = self._clean_topic(suggestion.get("topic", ""))
if topic_clean and topic_clean not in seen:
candidates.append((topic_clean, "hotspot"))
seen.add(topic_clean)
# 2. 从权重数据收集
topic_weights = self.analytics._weights.get("topic_weights", {})
for topic, info in topic_weights.items():
topic_clean = self._clean_topic(topic)
if topic_clean and topic_clean not in seen:
candidates.append((topic_clean, "weight"))
seen.add(topic_clean)
# 3. 从分析历史提取趋势主题
history = self.analytics._weights.get("analysis_history", [])
for entry in history[-5:]:
top_topic = entry.get("top_topic", "")
if top_topic and top_topic not in seen:
candidates.append((top_topic, "trend"))
seen.add(top_topic)
return candidates
# ========== 评分子模块 ==========
def _calc_hotspot_score(self, topic: str, hotspot_data: dict = None) -> int:
"""热点热度评分 (0-40)"""
if not hotspot_data:
return 0
score = 0
# 检查是否在热门主题中
hot_topics = hotspot_data.get("hot_topics", [])
for i, ht in enumerate(hot_topics):
if self._topic_similar(topic, ht):
# 排名越靠前分越高
score = max(score, 40 - i * 5)
break
# 检查是否在推荐建议中
suggestions = hotspot_data.get("suggestions", [])
for suggestion in suggestions:
if self._topic_similar(topic, suggestion.get("topic", "")):
score = max(score, 30)
break
return min(40, score)
def _calc_weight_score(self, topic: str) -> int:
"""历史互动权重评分 (0-30)"""
topic_weights = self.analytics._weights.get("topic_weights", {})
if not topic_weights:
return 0
# 精确匹配
if topic in topic_weights:
weight = topic_weights[topic].get("weight", 0)
# weight 原始范围 0-100映射到 0-30
return min(30, int(weight * 0.3))
# 模糊匹配
best_score = 0
for existing_topic, info in topic_weights.items():
if self._topic_similar(topic, existing_topic):
weight = info.get("weight", 0)
best_score = max(best_score, min(30, int(weight * 0.3)))
return best_score
def _calc_scarcity_score(self, topic: str) -> int:
"""
内容稀缺度评分 (0-20)
近 7 天已发布 >= 2 篇的主题: scarcity_score <= 5
"""
notes = self.analytics._analytics_data.get("notes", {})
seven_days_ago = (datetime.now() - timedelta(days=7)).isoformat()
recent_count = 0
for nid, note in notes.items():
collected = note.get("collected_at", "")
if collected >= seven_days_ago:
note_topic = note.get("topic", "")
if self._topic_similar(topic, note_topic):
recent_count += 1
if recent_count >= 2:
return min(5, max(0, 5 - recent_count)) # 发的越多越低
elif recent_count == 1:
return 12 # 有一篇,中等稀缺
else:
return 20 # 完全空白,高稀缺
def _calc_timeliness_score(self, topic: str) -> int:
"""
时效性评分 (0-10)
基于主题是否包含时效性关键词(季节、节日等)
"""
now = datetime.now()
month = now.month
# 季节关键词
season_keywords = {
"": [2, 3, 4, 5],
"": [5, 6, 7, 8],
"": [8, 9, 10, 11],
"": [11, 12, 1, 2],
"早春": [2, 3],
"初夏": [5, 6],
"初秋": [8, 9],
}
# 节日关键词
festival_windows = {
"情人节": (2, 10, 2, 18),
"三八": (3, 1, 3, 12),
"妇女节": (3, 1, 3, 12),
"母亲节": (5, 5, 5, 15),
"618": (6, 1, 6, 20),
"七夕": (7, 20, 8, 15),
"中秋": (9, 1, 9, 30),
"国庆": (9, 25, 10, 10),
"双十一": (10, 20, 11, 15),
"双11": (10, 20, 11, 15),
"双十二": (12, 1, 12, 15),
"圣诞": (12, 15, 12, 28),
"元旦": (12, 25, 1, 5),
"年货": (1, 5, 2, 10),
"春节": (1, 10, 2, 10),
"开学": (8, 20, 9, 15),
}
score = 5 # 基础分
# 季节匹配
for keyword, months in season_keywords.items():
if keyword in topic and month in months:
score = max(score, 8)
break
# 节日窗口匹配
for keyword, (m1, d1, m2, d2) in festival_windows.items():
if keyword in topic:
start = datetime(now.year, m1, d1)
end = datetime(now.year, m2, d2)
# 处理跨年
if start > end:
if now >= start or now <= end:
score = 10
break
elif start <= now <= end:
score = 10
break
else:
score = max(score, 3) # 不在窗口期但有时效关键词
return score
# ========== 去重 ==========
def _deduplicate(self, scored: list[dict]) -> list[dict]:
"""
去重: 语义相近的主题合并,保留分数较高者
例: "春季穿搭""早春穿搭" 合并为高分项
"""
if len(scored) <= 1:
return scored
result = []
merged_indices = set()
for i in range(len(scored)):
if i in merged_indices:
continue
best = scored[i]
for j in range(i + 1, len(scored)):
if j in merged_indices:
continue
if self._topic_similar(scored[i]["topic"], scored[j]["topic"]):
merged_indices.add(j)
if scored[j]["score"] > best["score"]:
best = scored[j]
result.append(best)
return result
# ========== 辅助方法 ==========
@staticmethod
def _clean_topic(topic: str) -> str:
"""清理主题文本"""
if not topic:
return ""
# 去除序号、emoji、多余空格
t = re.sub(r'^[\d.、)\]】]+\s*', '', topic.strip())
t = re.sub(r'[•·●]', '', t)
return t.strip()
@staticmethod
def _topic_similar(a: str, b: str) -> bool:
"""
判断两个主题是否语义相近 (简单规则匹配)
策略:
1. 完全相同 → True
2. 一方包含另一方 → True
3. 去除修饰词后相同 → True
4. 共享核心词比例 > 60% → True
"""
if not a or not b:
return False
a_clean = a.strip().lower()
b_clean = b.strip().lower()
# 完全相同
if a_clean == b_clean:
return True
# 包含关系
if a_clean in b_clean or b_clean in a_clean:
return True
# 去修饰词
modifiers = ["", "", "", "", "", "", "", "真的", "必看"]
a_core = a_clean
b_core = b_clean
for mod in modifiers:
a_core = a_core.replace(mod, "")
b_core = b_core.replace(mod, "")
if a_core and b_core and a_core == b_core:
return True
# 核心词重叠
# 按字分词 (中文简单分词)
a_chars = set(a_clean)
b_chars = set(b_clean)
if len(a_chars) >= 2 and len(b_chars) >= 2:
intersection = a_chars & b_chars
union = a_chars | b_chars
if len(intersection) / len(union) > 0.6:
return True
return False
@staticmethod
def _generate_reason(item: dict) -> str:
"""根据评分生成推荐理由"""
detail = item.get("score_detail", {})
parts = []
if detail.get("hotspot_score", 0) >= 25:
parts.append("当前热点话题")
if detail.get("weight_score", 0) >= 15:
parts.append("历史互动表现好")
if detail.get("scarcity_score", 0) >= 15:
parts.append("内容空白可抢占")
if detail.get("timeliness_score", 0) >= 8:
parts.append("时效性强")
source = item.get("source", "")
if source == "hotspot" and not parts:
parts.append("热点趋势推荐")
elif source == "weight" and not parts:
parts.append("基于历史表现推荐")
elif source == "trend" and not parts:
parts.append("持续趋势主题")
if not parts:
parts.append("综合推荐")
return "".join(parts)
@staticmethod
def _generate_angles(topic: str, source: str) -> list[str]:
"""
为主题生成 1-3 个创作角度建议
注意: 这里用规则生成,不调用 LLM
"""
angles = []
# 通用角度模板
templates_by_type = {
"穿搭": [
f"从预算角度分享{topic}的平替选择",
f"身材不同如何驾驭{topic}",
f"一周{topic}不重样的实穿记录",
],
"美食": [
f"零失败的{topic}详细做法",
f"外卖 vs 自己做{topic}的对比",
f"{topic}的隐藏吃法",
],
"护肤": [
f"不同肤质的{topic}选择指南",
f"踩雷vs回购{topic}真实体验",
f"平价替代大牌{topic}推荐",
],
"好物": [
f"用了半年的{topic}真实测评",
f"后悔没早买的{topic}清单",
f"从使用场景出发推荐{topic}",
],
}
# 根据主题关键词匹配模板
matched = False
for keyword, templates in templates_by_type.items():
if keyword in topic:
angles = templates[:3]
matched = True
break
if not matched:
# 通用角度
angles = [
f"个人真实体验分享{topic}",
f"新手入门{topic}的详细攻略",
f"关于{topic}的冷知识和避坑指南",
]
# 限制每个角度不超过 30 字
return [a[:30] for a in angles]