Files
llm-intelligence/scripts/tencent_catalog_lib.go
phamnazage-jpg 6c3569fb65
Some checks failed
CI / go-test (push) Has been cancelled
CI / frontend-build (push) Has been cancelled
CI / docker-build (push) Has been cancelled
feat(pricing): add qwen hunyuan and huawei maas payg importers
2026-05-22 12:13:54 +08:00

343 lines
7.7 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//go:build llm_script
package main
import (
"fmt"
"html"
"io"
"net/http"
"os"
"regexp"
"sort"
"strings"
"time"
)
// defaultTencentCatalogURL is the Tencent Cloud documentation page used as the
// default source of the catalog.
const defaultTencentCatalogURL = "https://cloud.tencent.com/document/product/1823/130060"

// defaultTencentCatalogTimeout is the fallback time budget (20 seconds) for
// fetching the catalog.
var defaultTencentCatalogTimeout = 20 * time.Second
// fetchTencentCatalogConfig carries the options for one catalog fetch.
type fetchTencentCatalogConfig struct {
	// URL is the address requested with GET by fetchTencentCatalogContent
	// when no fixture is configured.
	URL string

	// DryRun is not read by fetchTencentCatalogContent.
	// NOTE(review): presumably consumed by the caller to skip persisting
	// results — confirm.
	DryRun bool

	// Timeout is the time budget for the fetch.
	// NOTE(review): presumably applied to the HTTP client — confirm against
	// the code that builds the client.
	Timeout time.Duration

	// Fixture is the path of a local file. When it is not blank,
	// fetchTencentCatalogContent returns the file's contents instead of
	// issuing an HTTP request.
	Fixture string
}
type (
	// tencentCatalog is the result of parseTencentCatalog.
	tencentCatalog struct {
		// UpdatedAt is the text that follows the "最近更新时间" label in the
		// document.
		UpdatedAt string
		// Plans holds the parsed plans in document order.
		Plans []tencentPlan
		// Models holds the parsed models in document order.
		Models []tencentModel
	}

	// tencentPlan is one subscription plan entry. tryParseTencentPlan fills it
	// from consecutive lines of the normalized document.
	tencentPlan struct {
		// Series is the series heading that was active when the plan was parsed.
		Series string
		// Tier is the line after the plan's first line, with surrounding
		// parentheses and spaces trimmed.
		Tier string
		// Quota is the plan line that contains "Tokens".
		Quota string
		// Price is the plan line that contains "元/月".
		Price string
		// BillingCycle is the plan line that contains "订阅月".
		BillingCycle string
		// Scene is the optional line that follows the price; empty when absent.
		Scene string
	}

	// tencentModel is one model entry. tryParseTencentModel fills it from a
	// name line, a model ID line and any following note lines.
	tencentModel struct {
		// Series is the series heading that was active when the model was parsed.
		Series string
		// Name is the line that precedes the model ID.
		Name string
		// ModelID is the identifier line accepted by isTencentModelID.
		ModelID string
		// ContextLength is derived from the notes by extractContextLength and
		// is 0 when the notes do not state a context size.
		ContextLength int
		// Notes are the lines collected after the model ID.
		Notes []string
	}
)
// fetchTencentCatalogContent returns the raw catalog document as a string.
//
// When cfg.Fixture is not blank, the file at that path is read and returned and
// no HTTP request is made. Otherwise cfg.URL is requested with GET through
// client, and any status other than 200 OK is reported as an error.
//
// A nil client is replaced by a new http.Client whose Timeout is cfg.Timeout
// (a zero Timeout means no timeout, per net/http), so the function does not
// dereference a nil pointer. Every returned error is wrapped with the step
// that failed and keeps the underlying error available to errors.Is/As.
func fetchTencentCatalogContent(cfg fetchTencentCatalogConfig, client *http.Client) (string, error) {
	if strings.TrimSpace(cfg.Fixture) != "" {
		data, err := os.ReadFile(cfg.Fixture)
		if err != nil {
			return "", fmt.Errorf("read tencent catalog fixture: %w", err)
		}
		return string(data), nil
	}

	if client == nil {
		client = &http.Client{Timeout: cfg.Timeout}
	}

	req, err := http.NewRequest(http.MethodGet, cfg.URL, nil)
	if err != nil {
		return "", fmt.Errorf("build tencent catalog request: %w", err)
	}
	req.Header.Set("User-Agent", "llm-intelligence/tencent-catalog-fetcher")

	resp, err := client.Do(req)
	if err != nil {
		return "", fmt.Errorf("fetch tencent catalog: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("fetch tencent catalog %q: unexpected status %d", cfg.URL, resp.StatusCode)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("read tencent catalog response: %w", err)
	}
	return string(body), nil
}
// parseTencentCatalog turns the raw catalog document into a tencentCatalog.
//
// The document is first split into normalized lines. The walk over those lines
// tracks the current series heading and the current section ("plans" or
// "models"):
//   - the first line that yields an update timestamp sets UpdatedAt;
//   - a series heading switches the series and clears the section;
//   - a "套餐详情" / "可用模型" heading selects the section, but only once a
//     series has been seen;
//   - any other line is offered to the entry parser of the current section.
//
// It returns an error, together with a zero catalog, when the timestamp, the
// plans or the models are missing (checked in that order).
func parseTencentCatalog(raw string) (tencentCatalog, error) {
	const (
		sectionPlans  = "plans"
		sectionModels = "models"
	)

	var (
		result  tencentCatalog
		series  string
		section string
	)

	lines := normalizeTencentCatalogLines(raw)
	// The index is advanced manually because an entry parser may consume
	// several lines at once.
	for idx := 0; idx < len(lines); idx++ {
		current := lines[idx]

		if result.UpdatedAt == "" {
			if stamp := extractUpdatedAt(current); stamp != "" {
				result.UpdatedAt = stamp
				continue
			}
		}

		if heading := extractSeriesHeading(current); heading != "" {
			series, section = heading, ""
			continue
		}

		if current == "### 套餐详情" || current == "套餐详情" {
			// Section headings before the first series are ignored.
			if series != "" {
				section = sectionPlans
			}
			continue
		}
		if current == "### 可用模型" || current == "可用模型" {
			if series != "" {
				section = sectionModels
			}
			continue
		}

		if section == sectionPlans {
			if plan, last, ok := tryParseTencentPlan(lines, idx, series); ok {
				result.Plans = append(result.Plans, plan)
				idx = last // resume after the last line the plan consumed
			}
		} else if section == sectionModels {
			if model, last, ok := tryParseTencentModel(lines, idx, series); ok {
				result.Models = append(result.Models, model)
				idx = last // resume after the last line the model consumed
			}
		}
	}

	switch {
	case result.UpdatedAt == "":
		return tencentCatalog{}, fmt.Errorf("catalog updated_at not found")
	case len(result.Plans) == 0:
		return tencentCatalog{}, fmt.Errorf("catalog plans not found")
	case len(result.Models) == 0:
		return tencentCatalog{}, fmt.Errorf("catalog models not found")
	}
	return result, nil
}
func normalizeTencentCatalogLines(raw string) []string {
text := html.UnescapeString(raw)
replacements := []string{"<br>", "<br/>", "<br />", "</p>", "</div>", "</li>", "</tr>", "</td>", "</h1>", "</h2>", "</h3>", "</h4>", "</pre>", "</main>"}
for _, replacement := range replacements {
text = strings.ReplaceAll(text, replacement, "\n")
}
tagPattern := regexp.MustCompile(`<[^>]+>`)
text = tagPattern.ReplaceAllString(text, "")
text = strings.ReplaceAll(text, "\r\n", "\n")
text = strings.ReplaceAll(text, "\r", "\n")
rawLines := strings.Split(text, "\n")
lines := make([]string, 0, len(rawLines))
for _, rawLine := range rawLines {
line := strings.Trim(strings.TrimSpace(rawLine), "\uFEFF")
if line == "" {
continue
}
lines = append(lines, line)
}
return lines
}
// extractUpdatedAt returns the timestamp text that follows the
// "最近更新时间" label, or "" when line does not start with that label
// followed by a colon.
//
// Both the ASCII colon and the full-width colon (U+FF1A) are accepted, since
// Chinese documents commonly use the full-width form. The full-width colon is
// written as an escape so the two characters cannot be confused in the source.
func extractUpdatedAt(line string) string {
	const label = "最近更新时间"
	if !strings.HasPrefix(line, label) {
		return ""
	}
	rest := strings.TrimSpace(line[len(label):])
	for _, colon := range []string{":", "\uFF1A"} {
		if strings.HasPrefix(rest, colon) {
			return strings.TrimSpace(rest[len(colon):])
		}
	}
	return ""
}
// extractSeriesHeading returns the series name announced by line, or "" when
// line is not a series heading.
//
// Two forms are recognized:
//   - a markdown "## " heading whose title contains "Token Plan" or
//     "Coding Plan"; a trailing "套餐" is removed from the title;
//   - one of two bare titles (no "## " prefix), compared after stripping
//     byte-order marks and spaces from both ends.
func extractSeriesHeading(line string) string {
	const marker = "## "

	if strings.HasPrefix(line, marker) {
		title := strings.TrimSpace(line[len(marker):])
		if !strings.Contains(title, "Token Plan") && !strings.Contains(title, "Coding Plan") {
			return ""
		}
		return strings.TrimSpace(strings.TrimSuffix(title, "套餐"))
	}

	bare := strings.Trim(line, "\uFEFF ")
	if bare == "通用 Token Plan 套餐" {
		return "通用 Token Plan"
	}
	if bare == "Hy Token Plan 套餐" {
		return "Hy Token Plan"
	}
	return ""
}
// isTencentSectionBoundary reports whether line starts a new part of the
// document and therefore must never be absorbed into a plan or a model: a
// reserved line (any "#" heading or a bare section title) or a bare series
// title. The bare series titles mirror the ones accepted by
// extractSeriesHeading and must be kept in sync with it.
func isTencentSectionBoundary(line string) bool {
	if isReservedTencentLine(line) {
		return true
	}
	switch strings.Trim(line, "\uFEFF ") {
	case "通用 Token Plan 套餐", "Hy Token Plan 套餐":
		return true
	}
	return false
}

// tryParseTencentPlan attempts to read one plan starting at lines[start].
//
// A plan occupies five lines (name, tier, billing cycle, quota, price) as
// verified by looksLikeTencentPlan, optionally followed by one scene line. The
// tier is stored without its surrounding parentheses; both ASCII and
// full-width (U+FF08/U+FF09) parentheses are stripped.
//
// The line after the price is taken as the scene only when it is neither a
// section boundary nor the start of another plan. Without the boundary check a
// heading such as "可用模型" or "## ..." that directly follows a scene-less
// plan would be stored as the scene and skipped by the caller.
//
// It returns the plan, the index of the last line it consumed, and true. When
// lines[start] does not begin a plan it returns a zero plan, start and false.
func tryParseTencentPlan(lines []string, start int, series string) (tencentPlan, int, bool) {
	if !looksLikeTencentPlan(lines, start) {
		return tencentPlan{}, start, false
	}

	plan := tencentPlan{
		Series:       series,
		Tier:         strings.Trim(lines[start+1], "()\uFF08\uFF09 "),
		BillingCycle: lines[start+2],
		Quota:        lines[start+3],
		Price:        lines[start+4],
	}

	nextIndex := start + 4
	if sceneIndex := start + 5; sceneIndex < len(lines) {
		candidate := lines[sceneIndex]
		if !isTencentSectionBoundary(candidate) && !looksLikeTencentPlan(lines, sceneIndex) {
			plan.Scene = candidate
			nextIndex = sceneIndex
		}
	}
	return plan, nextIndex, true
}

// tryParseTencentModel attempts to read one model starting at lines[start].
//
// A model is a name line followed by a model ID line (see isTencentModelID);
// the name must not be a reserved line. Every following line is collected as a
// note until one of these is reached:
//   - a section boundary (any "#" heading, a bare section title or a bare
//     series title) — so the caller still sees the heading and can switch
//     series or section;
//   - the first line of a plan;
//   - the name line of the next model, i.e. a line whose successor is a
//     model ID.
//
// ContextLength is derived from the notes joined with single spaces. Notes is
// never nil.
//
// It returns the model, the index of the last line it consumed, and true. When
// lines[start] does not begin a model it returns a zero model, start and false.
func tryParseTencentModel(lines []string, start int, series string) (tencentModel, int, bool) {
	if start+1 >= len(lines) {
		return tencentModel{}, start, false
	}
	if !isTencentModelID(lines[start+1]) {
		return tencentModel{}, start, false
	}
	if isReservedTencentLine(lines[start]) {
		return tencentModel{}, start, false
	}

	model := tencentModel{
		Series:  series,
		Name:    lines[start],
		ModelID: lines[start+1],
	}

	notes := make([]string, 0, 4)
	nextIndex := start + 1
	for i := start + 2; i < len(lines); i++ {
		line := lines[i]
		if isTencentSectionBoundary(line) {
			break
		}
		if looksLikeTencentPlan(lines, i) {
			break
		}
		// A line followed by a model ID is the next model's name.
		if i+1 < len(lines) && isTencentModelID(lines[i+1]) {
			break
		}
		notes = append(notes, line)
		nextIndex = i
	}

	model.Notes = notes
	model.ContextLength = extractContextLength(strings.Join(notes, " "))
	return model, nextIndex, true
}
// isTencentPlanTier reports whether line is a parenthesized tier label such as
// "(Lite)".
//
// The opening and closing parenthesis may each be ASCII or full-width
// (U+FF08 / U+FF09); the full-width forms are written as escapes so they
// cannot be lost or confused with the ASCII ones. Comparing against empty
// strings, as the previous version did, accepts every line and makes the
// check meaningless.
func isTencentPlanTier(line string) bool {
	opens := strings.HasPrefix(line, "(") || strings.HasPrefix(line, "\uFF08")
	closes := strings.HasSuffix(line, ")") || strings.HasSuffix(line, "\uFF09")
	return opens && closes
}
// looksLikeTencentPlan reports whether the five lines beginning at
// lines[start] have the shape of a plan entry:
//
//	name   - neither a reserved line nor a model ID
//	tier   - accepted by isTencentPlanTier
//	cycle  - contains "订阅月"
//	quota  - contains "Tokens"
//	price  - contains "元/月"
//
// It returns false when fewer than five lines remain.
func looksLikeTencentPlan(lines []string, start int) bool {
	if len(lines) <= start+4 {
		return false
	}

	name := lines[start]
	tier := lines[start+1]
	cycle := lines[start+2]
	quota := lines[start+3]
	price := lines[start+4]

	switch {
	case isReservedTencentLine(name):
		return false
	case isTencentModelID(name):
		return false
	case !isTencentPlanTier(tier):
		return false
	case !strings.Contains(cycle, "订阅月"):
		return false
	case !strings.Contains(quota, "Tokens"):
		return false
	}
	return strings.Contains(price, "元/月")
}
// isReservedTencentLine reports whether line is structural rather than data:
// any line starting with "#", or one of the fixed overview/section titles.
// Titles are compared exactly, without trimming.
func isReservedTencentLine(line string) bool {
	if line != "" && line[0] == '#' {
		return true
	}
	for _, title := range [...]string{"Token Plan 个人版套餐概览", "套餐详情", "可用模型"} {
		if line == title {
			return true
		}
	}
	return false
}
// tencentModelIDPattern describes a model identifier: a lower-case letter or
// digit followed by any number of lower-case letters, digits, dots,
// underscores or hyphens. It is compiled once at package scope because
// isTencentModelID runs for every line the parsers inspect.
var tencentModelIDPattern = regexp.MustCompile(`^[a-z0-9][a-z0-9._-]*$`)

// isTencentModelID reports whether the whole line is a model identifier as
// described by tencentModelIDPattern.
func isTencentModelID(line string) bool {
	return tencentModelIDPattern.MatchString(line)
}
// tencentContextLengthPattern matches a context size such as "128K 上下文" or
// "1m上下文": digits, a K or M unit in either case, then "上下文", with
// optional whitespace between the parts. It is compiled once at package scope
// instead of on every call.
var tencentContextLengthPattern = regexp.MustCompile(`(?i)(\d+)\s*([KM])\s*上下文`)

// extractContextLength returns the context size mentioned in text as a token
// count, using binary multiples (K = 1024, M = 1024*1024). Only the first
// match is used. It returns 0 when text states no context size or when the
// number cannot be parsed (for example because it overflows int).
func extractContextLength(text string) int {
	matches := tencentContextLengthPattern.FindStringSubmatch(text)
	if len(matches) != 3 {
		return 0
	}

	var value int
	if _, err := fmt.Sscanf(matches[1], "%d", &value); err != nil {
		return 0
	}

	switch strings.ToUpper(matches[2]) {
	case "K":
		return value * 1024
	case "M":
		return value * 1024 * 1024
	default:
		return 0
	}
}
// formatSeriesSummary renders the number of plans per series as
// "name:count" pairs joined by commas, with the series sorted by name.
// It returns "" when plans is empty.
func formatSeriesSummary(plans []tencentPlan) string {
	perSeries := map[string]int{}
	var names []string
	for i := range plans {
		key := plans[i].Series
		if _, seen := perSeries[key]; !seen {
			names = append(names, key)
		}
		perSeries[key]++
	}
	// Sorting keeps the output independent of input order.
	sort.Strings(names)

	var summary strings.Builder
	for i, name := range names {
		if i > 0 {
			summary.WriteString(",")
		}
		fmt.Fprintf(&summary, "%s:%d", name, perSeries[name])
	}
	return summary.String()
}