#!/bin/bash
# run_daily.sh - daily data collection and report generation pipeline.
# Sprint 3: full scheduling script
# (collect -> quality check -> report generation -> archive -> notify).
set -euo pipefail

PROJECT_DIR="/home/long/project/llm-intelligence"

# Shared report helpers. This script calls report_date_value,
# report_markdown_path, report_html_path, report_archive_*_path,
# track_report_state and archive_report_artifacts without defining them;
# presumably they come from this file - confirm.
# shellcheck disable=SC1091
. "$PROJECT_DIR/scripts/report_utils.sh"

# Optional environment files.
# NOTE(review): .env is sourced AFTER .env.local, so for plain assignments
# the value in .env wins on conflict - confirm this precedence is intended.
if [[ -f "$PROJECT_DIR/.env.local" ]]; then
  # shellcheck disable=SC1091
  source "$PROJECT_DIR/.env.local"
fi
if [[ -f "$PROJECT_DIR/.env" ]]; then
  # shellcheck disable=SC1091
  source "$PROJECT_DIR/.env"
fi

# Connection string: DATABASE_URL from the environment, else a local default.
DB_URL="${DATABASE_URL:-host=/var/run/postgresql dbname=llm_intelligence user=long sslmode=disable}"
REPORT_DATE="$(report_date_value)"
LOG_FILE="/tmp/llm_hub_daily_${REPORT_DATE}.log"
# Empty webhook disables alerting (see error_exit).
FEISHU_WEBHOOK="${FEISHU_WEBHOOK:-}"

# Mutable pipeline state, updated as stages run.
MODEL_COUNT=""
FETCH_OUT="${PROJECT_DIR}/models.json"
FETCH_TOTAL="0"
PIPELINE_STAGE_SET="openrouter,multi_source,official_imports,daily_report"
PIPELINE_SOURCE_SET="openrouter,moonshot,deepseek,openai,zhipu,baidu,bytedance"
PIPELINE_FAILED_SOURCE_SET="none"
MULTI_SOURCE_AUDIT="multi_source_audit=unavailable"
PIPELINE_AUDIT_SUMMARY=""
#######################################
# Logging helper: prints a timestamped line to stdout and appends the same
# line to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout and in LOG_FILE
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}
#######################################
# Flattens a summary file into a single line: newlines become spaces, runs of
# whitespace collapse to one space, leading/trailing space is trimmed.
# Arguments: $1 - path to the summary file
# Outputs:   the normalized text on stdout; nothing if the file is missing
# Returns:   0 when the file does not exist, else the pipeline status
#######################################
normalize_summary_file() {
  local path="$1"
  if [[ ! -f "$path" ]]; then
    return 0
  fi
  # -E keeps the "one or more" quantifier portable (no GNU-only \+).
  tr '\n' ' ' < "$path" | sed -E 's/[[:space:]]+/ /g; s/^ //; s/ $//'
}
#######################################
# Pulls the value of the "failed_source_keys=" field out of a one-line summary.
# The value runs up to the next space; if the field occurs several times the
# last occurrence is used (greedy leading match).
# Arguments: $1 - normalized summary text
# Outputs:   the field value on stdout, or nothing when the field is absent
#######################################
extract_failed_source_keys() {
  local summary="$1"
  printf '%s\n' "$summary" \
    | sed -n 's/.*failed_source_keys=\([^ ]*\).*/\1/p'
}
#######################################
# Adds one or more comma-separated source keys to the failed-source set.
# An empty argument or the literal "none" is a no-op. Keys already present
# are not added again, so the set never contains duplicates.
# Globals:   PIPELINE_FAILED_SOURCE_SET (read/written; "none" means empty)
# Arguments: $1 - comma-separated keys, e.g. "moonshot,openai"
# Returns:   0
#######################################
merge_failed_source_keys() {
  local keys="$1"
  local key rest

  if [[ -z "$keys" || "$keys" == "none" ]]; then
    return 0
  fi

  # Walk the list with parameter expansion; the trailing comma guarantees
  # that the last key is also terminated.
  rest="${keys},"
  while [[ -n "$rest" ]]; do
    key="${rest%%,*}"
    rest="${rest#*,}"
    if [[ -z "$key" || "$key" == "none" ]]; then
      continue
    fi
    if [[ "${PIPELINE_FAILED_SOURCE_SET:-none}" == "none" ]]; then
      PIPELINE_FAILED_SOURCE_SET="$key"
    elif [[ ",${PIPELINE_FAILED_SOURCE_SET}," != *",${key},"* ]]; then
      PIPELINE_FAILED_SOURCE_SET="${PIPELINE_FAILED_SOURCE_SET},${key}"
    fi
  done
  return 0
}
#######################################
# Rebuilds the one-line runtime audit summary from the current pipeline state.
# Globals:   PIPELINE_STAGE_SET, PIPELINE_SOURCE_SET,
#            PIPELINE_FAILED_SOURCE_SET, FETCH_TOTAL, MULTI_SOURCE_AUDIT (read)
#            PIPELINE_AUDIT_SUMMARY (written)
# An unset or empty FETCH_TOTAL is reported as 0.
#######################################
refresh_pipeline_audit() {
  local -a fields=(
    "runtime_audit"
    "stage_set=${PIPELINE_STAGE_SET}"
    "selected_source_keys=${PIPELINE_SOURCE_SET}"
    "failed_source_keys=${PIPELINE_FAILED_SOURCE_SET}"
    "openrouter_total=${FETCH_TOTAL:-0}"
    "${MULTI_SOURCE_AUDIT}"
  )
  # Join with single spaces without touching the caller's IFS.
  local IFS=' '
  PIPELINE_AUDIT_SUMMARY="${fields[*]}"
}
#######################################
# Error handler: logs the failure, produces a fallback report, records the
# failed run, optionally alerts, then terminates the script with status 1.
# Globals:   REPORT_DATE, DB_URL, MODEL_COUNT, PIPELINE_AUDIT_SUMMARY,
#            LOG_FILE, FEISHU_WEBHOOK (read)
# Arguments: $1 - error message
# Returns:   never returns; exits 1
#######################################
error_exit() {
  local output_path=""
  local candidate_path=""

  log "❌ 错误: $1"
  refresh_pipeline_audit

  # Degrade: copy yesterday's report. Guarded so that a failure inside the
  # fallback (e.g. cp error under `set -e`) cannot stop the state tracking
  # and alerting below.
  if ! fallback_report; then
    log "⚠️ 降级报告生成失败"
  fi

  # Path is used exactly as the helper returns it (not prefixed with
  # PROJECT_DIR), so it is resolved against the current directory.
  candidate_path="$(report_markdown_path "$REPORT_DATE")" || candidate_path=""
  if [[ -n "$candidate_path" && -f "$candidate_path" ]]; then
    output_path="$candidate_path"
  fi

  # Best effort: a tracking failure must not mask the original error.
  track_report_state "$DB_URL" "$REPORT_DATE" "failed" "${MODEL_COUNT:-}" \
    "$PIPELINE_AUDIT_SUMMARY" "$output_path" "$1" "scheduled" "cron" "true" \
    >> "$LOG_FILE" 2>&1 || true

  # Send the alert only when a webhook is configured.
  if [[ -n "$FEISHU_WEBHOOK" ]]; then
    send_alert "$1" || true
  fi
  exit 1
}
# Build the initial audit summary from the default state so that
# PIPELINE_AUDIT_SUMMARY is populated before the first stage runs.
refresh_pipeline_audit
#######################################
# Degraded mode: copy yesterday's report to today's path, rewrite the date,
# and mark the Markdown copy with "[数据延迟]" on its first line.
# The HTML report is copied too when present; when both copies exist the
# artifacts are archived (best effort).
# Globals:   PROJECT_DIR, REPORT_DATE, LOG_FILE (read)
# NOTE(review): "yesterday" is relative to the wall clock, not to
# REPORT_DATE - confirm the two always agree.
#######################################
fallback_report() {
  local yesterday yesterday_md today_md yesterday_html today_html
  yesterday="$(date -d "yesterday" +%Y-%m-%d)"
  yesterday_md="${PROJECT_DIR}/$(report_markdown_path "$yesterday")"
  today_md="${PROJECT_DIR}/$(report_markdown_path "$REPORT_DATE")"
  yesterday_html="${PROJECT_DIR}/$(report_html_path "$yesterday")"
  today_html="${PROJECT_DIR}/$(report_html_path "$REPORT_DATE")"

  if [[ ! -f "$yesterday_md" ]]; then
    log "⚠️ 无昨日报告可供复制"
    return 0
  fi

  cp "$yesterday_md" "$today_md"
  sed -i "s/${yesterday}/${REPORT_DATE}/g" "$today_md"
  # Prefix the first line with the delay marker, unless yesterday's report
  # was itself a fallback copy and already carries it (avoids stacking the
  # marker on consecutive failure days).
  sed -i '1{/^# \[数据延迟\] /!s/^/# [数据延迟] /;}' "$today_md"

  if [[ -f "$yesterday_html" ]]; then
    cp "$yesterday_html" "$today_html"
    sed -i "s/${yesterday}/${REPORT_DATE}/g" "$today_html"
  fi

  if [[ -f "$today_md" && -f "$today_html" ]]; then
    archive_report_artifacts "$REPORT_DATE" >> "$LOG_FILE" 2>&1 || true
  fi

  log "⚠️ 已复制昨日报告并标记[数据延迟]"
}
#######################################
# Sends a Feishu text alert about a failed daily run.
# Globals:   REPORT_DATE, LOG_FILE, FEISHU_WEBHOOK (read)
# Arguments: $1 - error message (JSON-escaped before being embedded)
# Returns:   status of the final log call; a curl failure is logged, not fatal
#######################################
send_alert() {
  local msg="$1"
  local bs='\' dq='"' nl=$'\n' cr=$'\r' tab=$'\t'
  local escaped payload

  # Escape characters that would otherwise break the JSON string literal.
  # Backslash must be handled first so later escapes are not doubled.
  escaped=${msg//"$bs"/$bs$bs}
  escaped=${escaped//"$dq"/$bs$dq}
  escaped=${escaped//"$nl"/${bs}n}
  escaped=${escaped//"$cr"/${bs}r}
  escaped=${escaped//"$tab"/${bs}t}

  payload="{\"msg_type\":\"text\",\"content\":{\"text\":\"🚨 LLM Hub 日报失败\\n日期: ${REPORT_DATE}\\n错误: ${escaped}\\n请检查日志: ${LOG_FILE}\"}}"

  # Alerting is best effort, but only claim success when curl succeeded.
  if curl -s -X POST -H "Content-Type: application/json" \
    -d "$payload" \
    "$FEISHU_WEBHOOK" > /dev/null; then
    log "📢 飞书告警已发送"
  else
    log "⚠️ 飞书告警发送失败"
  fi
}
# Main flow
log "🚀 开始每日流水线: ${REPORT_DATE}"
cd "$PROJECT_DIR"

# 1. Data collection
log "1️⃣ 数据采集..."
if ! go run scripts/fetch_openrouter.go -strict-real -out "$FETCH_OUT" >> "$LOG_FILE" 2>&1; then
  merge_failed_source_keys "openrouter"
  error_exit "数据采集失败"
fi

# Read the "total" field from the fetched JSON. A parse failure is routed
# through error_exit (fallback report + alert) instead of letting `set -e`
# abort the script silently.
if ! FETCH_TOTAL=$(python3 - "$FETCH_OUT" 2>> "$LOG_FILE" <<'PY'
import json, sys

path = sys.argv[1]
with open(path, 'r', encoding='utf-8') as f:
    data = json.load(f)
print(int(data.get("total", 0)))
PY
); then
  FETCH_TOTAL="0"
  merge_failed_source_keys "openrouter"
  error_exit "采集结果解析失败: ${FETCH_OUT}"
fi

# Fewer than 10 entries is treated as an abnormal fetch.
if [ "${FETCH_TOTAL:-0}" -lt 10 ]; then
  merge_failed_source_keys "openrouter"
  error_exit "本次采集结果异常: total=${FETCH_TOTAL:-0} < 10"
fi
refresh_pipeline_audit
log "✅ 数据采集完成"
# 1.5 Multi-source supplementary sync
log "1️⃣➕ 多源补充同步..."
MULTI_SOURCE_KEYS="moonshot,deepseek,openai"
MULTI_SOURCE_OUTPUT="$(mktemp)"
if ! go run scripts/fetch_multi_source.go --sources "$MULTI_SOURCE_KEYS" > "$MULTI_SOURCE_OUTPUT" 2>> "$LOG_FILE"; then
  MULTI_SOURCE_SUMMARY="$(normalize_summary_file "$MULTI_SOURCE_OUTPUT")"
  MULTI_SOURCE_FAILED="$(extract_failed_source_keys "$MULTI_SOURCE_SUMMARY")"
  if [[ -n "$MULTI_SOURCE_SUMMARY" ]]; then
    MULTI_SOURCE_AUDIT="multi_source_audit=${MULTI_SOURCE_SUMMARY}"
  else
    MULTI_SOURCE_AUDIT="multi_source_audit=stage_failed"
  fi
  # When the stage failed without reporting which sources failed (no summary,
  # or a summary lacking failed_source_keys), blame every requested source so
  # the audit never shows a failed stage with no failed source.
  merge_failed_source_keys "${MULTI_SOURCE_FAILED:-$MULTI_SOURCE_KEYS}"
  cat "$MULTI_SOURCE_OUTPUT" >> "$LOG_FILE"
  rm -f "$MULTI_SOURCE_OUTPUT"
  error_exit "多源补充同步失败"
fi

# Stage succeeded: record its summary; it may still list individual sources
# as failed, which are merged into the failed-source set.
MULTI_SOURCE_SUMMARY="$(normalize_summary_file "$MULTI_SOURCE_OUTPUT")"
MULTI_SOURCE_AUDIT="multi_source_audit=${MULTI_SOURCE_SUMMARY:-none}"
merge_failed_source_keys "$(extract_failed_source_keys "$MULTI_SOURCE_SUMMARY")"
refresh_pipeline_audit
cat "$MULTI_SOURCE_OUTPUT" >> "$LOG_FILE"
rm -f "$MULTI_SOURCE_OUTPUT"

# Official imports; each failure is attributed to its own source key.
if ! go run -tags llm_script scripts/import_zhipu_data.go >> "$LOG_FILE" 2>&1; then
  merge_failed_source_keys "zhipu"
  error_exit "智谱官方导入失败"
fi
if ! go run -tags llm_script scripts/export_official_seed_json.go >> "$LOG_FILE" 2>&1; then
  merge_failed_source_keys "official_seed_export"
  error_exit "官方种子导出失败"
fi
if ! go run -tags llm_script scripts/import_phase2_data.go >> "$LOG_FILE" 2>&1; then
  merge_failed_source_keys "baidu"
  error_exit "百度官方导入失败"
fi
if ! go run -tags llm_script scripts/import_bytedance_data.go >> "$LOG_FILE" 2>&1; then
  merge_failed_source_keys "bytedance"
  error_exit "字节官方导入失败"
fi
refresh_pipeline_audit
log "✅ 多源补充同步完成"
# 2. Data quality check
log "2️⃣ 数据质量检查..."
# Query failures go through error_exit (fallback + alert) rather than
# aborting via `set -e`/`pipefail`; psql's stderr is kept in the log.
if ! MODEL_COUNT="$(psql "$DB_URL" -t -c "SELECT COUNT(*) FROM models WHERE deleted_at IS NULL" 2>> "$LOG_FILE" | tr -d '[:space:]')"; then
  MODEL_COUNT=""
  error_exit "模型数量查询失败"
fi
# Guard the numeric comparison below against empty or non-numeric output.
if [[ ! "$MODEL_COUNT" =~ ^[0-9]+$ ]]; then
  error_exit "模型数量查询结果无效: '${MODEL_COUNT}'"
fi
if [ "$MODEL_COUNT" -lt 10 ]; then
  error_exit "模型数量不足: ${MODEL_COUNT} < 10"
fi
log "✅ 数据质量检查通过 (模型数: ${MODEL_COUNT})"
# 3. Generate the daily report
log "3️⃣ 生成日报..."
export DATABASE_URL="$DB_URL"
# Run metadata and the audit summary are passed to the generator through
# per-command environment variables.
if ! REPORT_RUN_KIND="scheduled" \
  REPORT_TRIGGER_SOURCE="cron" \
  REPORT_IS_OFFICIAL_DAILY="true" \
  REPORT_RUNTIME_AUDIT="$PIPELINE_AUDIT_SUMMARY" \
  go run scripts/generate_daily_report.go >> "$LOG_FILE" 2>&1; then
  error_exit "日报生成失败"
fi
log "✅ 日报生成完成"
# 4. Verify the archive: both the Markdown and the HTML archive copies
# must exist for today's report.
log "4️⃣ 校验归档..."
if [ ! -f "$(report_archive_markdown_path "$REPORT_DATE")" ] \
  || [ ! -f "$(report_archive_html_path "$REPORT_DATE")" ]; then
  error_exit "日报归档失败"
fi
log "✅ 归档完成"
# 5. Verify run records: both tables must hold at least one 'generated'
# row for today's report date.
log "5️⃣ 校验运行记录..."
# awk exits non-zero unless the first output line is a count >= 1; the END
# rule also makes empty output count as a failure.
if ! psql "$DB_URL" -Atqc "select count(*) from daily_report where report_date = DATE '${REPORT_DATE}' and status = 'generated';" \
  | awk 'NR == 1 { ok = ($1 >= 1) } END { exit !ok }'; then
  error_exit "daily_report 未写入 generated 记录"
fi
if ! psql "$DB_URL" -Atqc "select count(*) from report_runs where report_date = DATE '${REPORT_DATE}' and status = 'generated';" \
  | awk 'NR == 1 { ok = ($1 >= 1) } END { exit !ok }'; then
  error_exit "report_runs 未写入 generated 记录"
fi
log "✅ 日报记录更新完成"

log "🎉 每日流水线全部完成!"
log "📄 Markdown: $(report_markdown_path "$REPORT_DATE")"
log "🌐 HTML: $(report_html_path "$REPORT_DATE")"
exit 0