379 lines
12 KiB
Go
379 lines
12 KiB
Go
//go:build llm_script
|
|
|
|
package main
|
|
|
|
import (
|
|
"context"
|
|
"database/sql"
|
|
"flag"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"os"
|
|
"os/exec"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// defaultSensenovaDocsURL is the default value of the -docs-url flag. It is
// also recorded on every imported row as the operator website and source URL.
const defaultSensenovaDocsURL = "https://platform.sensenova.cn/docs"

// defaultSensenovaModelsURL is the default value of the -models-url flag; the
// page behind it is scanned for JavaScript chunk references.
const defaultSensenovaModelsURL = "https://www.sensenova.cn/models"
|
|
|
|
// sensenovaPricingImportConfig carries the command-line options of the
// importer into runSensenovaPricingImport.
type sensenovaPricingImportConfig struct {
	// DocsURL is the API documentation page rendered with a headless browser;
	// ModelsURL is the models page whose script chunks are downloaded.
	DocsURL, ModelsURL string

	// Fixture, when non-blank, is the path of a local file used instead of
	// fetching DocsURL and ModelsURL.
	Fixture string

	// DryRun makes the import print a summary without touching the database.
	DryRun bool

	// Timeout bounds both the headless-browser render and each HTTP request.
	Timeout time.Duration
}
|
|
|
|
// sensenovaPricingFixture bundles the two raw inputs the parser works on:
// DocsHTML is the rendered documentation page and ModelsText is the text of
// the models-page script bundle that carries the free public-beta wording.
type sensenovaPricingFixture struct {
	DocsHTML, ModelsText string
}
|
|
|
|
// sensenovaPricingDocModel describes one model entry from the documentation
// page.
//
// NOTE(review): this type is not referenced in the visible part of the file.
// QuotaPer5Hour presumably corresponds to the per-five-hour call limit that
// sensenovaOverviewCardPattern captures — confirm against its users.
type sensenovaPricingDocModel struct {
	ModelName, ModelID string
	QuotaPer5Hour      int
}
|
|
|
|
// sensenovaFixtureSplitMarker separates the docs HTML segment from the models
// bundle segment inside a fixture file.
var sensenovaFixtureSplitMarker = "\n===SENSENOVA_MODELS_BUNDLE===\n"

// sensenovaOverviewCardPattern matches one model overview card in the docs
// HTML. Capture groups: 1 = card title, 2 = calls allowed per five hours,
// 3 = model ID.
var sensenovaOverviewCardPattern = regexp.MustCompile(`(?s)<h4[^>]*>([^<]+)</h4>.*?调用次数限制</p><p[^>]*>每5小时([0-9]+)次</p>.*?MODEL ID</p><code[^>]*>([^<]+)</code>`)

// sensenovaModelsScriptPattern captures the src of every Next.js static chunk
// script, whether the URL is absolute or root-relative.
var sensenovaModelsScriptPattern = regexp.MustCompile(`src="([^"]+/_next/static/chunks/[^"]+\.js|/_next/static/chunks/[^"]+\.js)"`)

// sensenovaPricingZeroPattern matches a JSON "pricing" object whose prompt,
// completion, image and request prices are all the string "0".
var sensenovaPricingZeroPattern = regexp.MustCompile(`(?s)"pricing"\s*:\s*\{\s*"prompt"\s*:\s*"0"\s*,\s*"completion"\s*:\s*"0"\s*,\s*"image"\s*:\s*"0"\s*,\s*"request"\s*:\s*"0"`)
|
|
|
|
func main() {
|
|
loadSubscriptionImportEnv()
|
|
|
|
var docsURL string
|
|
var modelsURL string
|
|
var fixture string
|
|
var dryRun bool
|
|
var timeoutSeconds int
|
|
|
|
flag.StringVar(&docsURL, "docs-url", defaultSensenovaDocsURL, "商汤 SenseNova API 文档页")
|
|
flag.StringVar(&modelsURL, "models-url", defaultSensenovaModelsURL, "商汤 SenseNova 模型页")
|
|
flag.StringVar(&fixture, "fixture", "", "商汤 SenseNova 价格样例文件")
|
|
flag.BoolVar(&dryRun, "dry-run", false, "仅解析并打印摘要,不写入数据库")
|
|
flag.IntVar(&timeoutSeconds, "timeout", 45, "请求超时(秒)")
|
|
flag.Parse()
|
|
|
|
cfg := sensenovaPricingImportConfig{
|
|
DocsURL: docsURL,
|
|
ModelsURL: modelsURL,
|
|
Fixture: fixture,
|
|
DryRun: dryRun,
|
|
Timeout: time.Duration(timeoutSeconds) * time.Second,
|
|
}
|
|
|
|
var db *sql.DB
|
|
var err error
|
|
if !cfg.DryRun {
|
|
db, err = subscriptionImportDB()
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "open db: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
defer db.Close()
|
|
}
|
|
|
|
if err := runSensenovaPricingImport(cfg, db, os.Stdout); err != nil {
|
|
fmt.Fprintf(os.Stderr, "import_sensenova_pricing: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
}
|
|
|
|
func runSensenovaPricingImport(cfg sensenovaPricingImportConfig, db *sql.DB, out io.Writer) error {
|
|
fixture, err := fetchSensenovaPricingFixture(cfg)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
records, err := parseSensenovaPricingCatalog(fixture)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
records = dedupeOfficialPricingRecords(records)
|
|
if len(records) == 0 {
|
|
return fmt.Errorf("unexpected sensenova pricing content: no records")
|
|
}
|
|
if cfg.DryRun {
|
|
_, err = fmt.Fprintf(out, "source=sensenova-pricing-import models=%d operator=%s dry_run=true\n", len(records), records[0].OperatorName)
|
|
return err
|
|
}
|
|
if db == nil {
|
|
return fmt.Errorf("db is required when dry-run=false")
|
|
}
|
|
if err := upsertOfficialPricingRecords(db, records, "sensenova-pricing-import"); err != nil {
|
|
return err
|
|
}
|
|
var tableRows int
|
|
if err := db.QueryRow(`SELECT COUNT(*) FROM region_pricing`).Scan(&tableRows); err != nil {
|
|
return fmt.Errorf("count region_pricing: %w", err)
|
|
}
|
|
_, err = fmt.Fprintf(out, "source=sensenova-pricing-import models=%d operator=%s table_rows=%d dry_run=false\n", len(records), records[0].OperatorName, tableRows)
|
|
return err
|
|
}
|
|
|
|
func fetchSensenovaPricingFixture(cfg sensenovaPricingImportConfig) (sensenovaPricingFixture, error) {
|
|
if strings.TrimSpace(cfg.Fixture) != "" {
|
|
data, err := os.ReadFile(cfg.Fixture)
|
|
if err != nil {
|
|
return sensenovaPricingFixture{}, fmt.Errorf("read fixture %s: %w", cfg.Fixture, err)
|
|
}
|
|
return splitSensenovaFixture(string(data))
|
|
}
|
|
|
|
docsHTML, err := fetchRenderedPricingPageWithChromium(cfg.DocsURL, cfg.Timeout)
|
|
if err != nil {
|
|
return sensenovaPricingFixture{}, fmt.Errorf("fetch docs render: %w", err)
|
|
}
|
|
modelsText, err := fetchSensenovaModelsBundle(cfg.ModelsURL, cfg.Timeout)
|
|
if err != nil {
|
|
return sensenovaPricingFixture{}, err
|
|
}
|
|
return sensenovaPricingFixture{DocsHTML: docsHTML, ModelsText: modelsText}, nil
|
|
}
|
|
|
|
func splitSensenovaFixture(raw string) (sensenovaPricingFixture, error) {
|
|
parts := strings.SplitN(raw, sensenovaFixtureSplitMarker, 2)
|
|
if len(parts) != 2 {
|
|
return sensenovaPricingFixture{}, fmt.Errorf("unexpected sensenova fixture: missing models bundle marker")
|
|
}
|
|
docsHTML := strings.TrimSpace(parts[0])
|
|
modelsText := strings.TrimSpace(parts[1])
|
|
if docsHTML == "" || modelsText == "" {
|
|
return sensenovaPricingFixture{}, fmt.Errorf("unexpected sensenova fixture: empty docs or models segment")
|
|
}
|
|
return sensenovaPricingFixture{DocsHTML: docsHTML, ModelsText: modelsText}, nil
|
|
}
|
|
|
|
func fetchSensenovaModelsBundle(modelsURL string, timeout time.Duration) (string, error) {
|
|
client := &http.Client{Timeout: timeout}
|
|
html, err := fetchRawPricingPage(modelsURL, "", client)
|
|
if err != nil {
|
|
return "", fmt.Errorf("fetch models page shell: %w", err)
|
|
}
|
|
scripts := sensenovaModelsScriptPattern.FindAllStringSubmatch(html, -1)
|
|
if len(scripts) == 0 {
|
|
return "", fmt.Errorf("unexpected sensenova models page: no chunk scripts found")
|
|
}
|
|
seen := make(map[string]struct{}, len(scripts))
|
|
for _, match := range scripts {
|
|
if len(match) != 2 {
|
|
continue
|
|
}
|
|
scriptURL, err := resolveSensenovaAssetURL(modelsURL, match[1])
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if _, ok := seen[scriptURL]; ok {
|
|
continue
|
|
}
|
|
seen[scriptURL] = struct{}{}
|
|
bundle, err := fetchRawPricingPage(scriptURL, "", client)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if sensenovaBundleConfirmsFreeBeta(bundle) {
|
|
return bundle, nil
|
|
}
|
|
}
|
|
return "", fmt.Errorf("unexpected sensenova models page: free-beta bundle not found")
|
|
}
|
|
|
|
// resolveSensenovaAssetURL turns assetPath, which may be absolute or relative,
// into an absolute URL string by resolving it against baseURL.
//
// It returns an error when either baseURL or assetPath cannot be parsed as a
// URL; baseURL is parsed first.
func resolveSensenovaAssetURL(baseURL string, assetPath string) (string, error) {
	base, baseErr := url.Parse(baseURL)
	if baseErr != nil {
		return "", baseErr
	}

	// (*url.URL).Parse parses the reference and resolves it against the
	// receiver in one step.
	resolved, refErr := base.Parse(assetPath)
	if refErr != nil {
		return "", refErr
	}
	return resolved.String(), nil
}
|
|
|
|
// sensenovaBundleConfirmsFreeBeta reports whether raw contains both a
// "free during public beta" statement and an "all models included" statement,
// each accepted in either its Chinese or its English wording.
func sensenovaBundleConfirmsFreeBeta(raw string) bool {
	mentionsAny := func(phrases ...string) bool {
		for _, phrase := range phrases {
			if strings.Contains(raw, phrase) {
				return true
			}
		}
		return false
	}

	if !mentionsAny("公测期完全免费开放", "free during public beta") {
		return false
	}
	return mentionsAny("所有模型完全开放", "all models included")
}
|
|
|
|
func fetchRenderedPricingPageWithChromium(pageURL string, timeout time.Duration) (string, error) {
|
|
browserPath, err := lookupChromiumBinaryForSensenova()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
|
defer cancel()
|
|
cmd := exec.CommandContext(ctx, browserPath,
|
|
"--headless",
|
|
"--no-sandbox",
|
|
"--disable-gpu",
|
|
"--virtual-time-budget=8000",
|
|
"--dump-dom",
|
|
pageURL,
|
|
)
|
|
cmd.Stderr = io.Discard
|
|
out, err := cmd.Output()
|
|
if ctx.Err() == context.DeadlineExceeded {
|
|
return "", fmt.Errorf("chromium render timeout after %s", timeout)
|
|
}
|
|
if err != nil {
|
|
return "", fmt.Errorf("chromium dump-dom: %w", err)
|
|
}
|
|
if len(out) == 0 {
|
|
return "", fmt.Errorf("chromium dump-dom returned empty output")
|
|
}
|
|
return string(out), nil
|
|
}
|
|
|
|
// lookupChromiumBinaryForSensenova returns the path of the first
// Chromium-compatible executable found in PATH, trying chromium,
// chromium-browser, google-chrome and google-chrome-stable in that order.
// It returns an error when none of them can be located.
func lookupChromiumBinaryForSensenova() (string, error) {
	candidates := [...]string{"chromium", "chromium-browser", "google-chrome", "google-chrome-stable"}
	for i := range candidates {
		resolved, err := exec.LookPath(candidates[i])
		if err != nil {
			continue
		}
		return resolved, nil
	}
	return "", fmt.Errorf("no chromium-compatible browser found in PATH")
}
|
|
|
|
func parseSensenovaPricingCatalog(fixture sensenovaPricingFixture) ([]officialPricingRecord, error) {
|
|
if !sensenovaBundleConfirmsFreeBeta(fixture.ModelsText) {
|
|
return nil, fmt.Errorf("unexpected sensenova models bundle: missing public-beta free signal")
|
|
}
|
|
if !strings.Contains(fixture.DocsHTML, "GET https://token.sensenova.cn/v1/models") {
|
|
return nil, fmt.Errorf("unexpected sensenova docs content: missing list models endpoint")
|
|
}
|
|
if !sensenovaPricingZeroPattern.MatchString(fixture.DocsHTML) {
|
|
return nil, fmt.Errorf("unexpected sensenova docs content: missing zero pricing object example")
|
|
}
|
|
|
|
matches := sensenovaOverviewCardPattern.FindAllStringSubmatch(fixture.DocsHTML, -1)
|
|
if len(matches) == 0 {
|
|
return nil, fmt.Errorf("unexpected sensenova docs content: no model overview cards parsed")
|
|
}
|
|
|
|
providerNameCn, providerCountry, providerWebsite := providerMetadata("SenseTime")
|
|
records := make([]officialPricingRecord, 0, len(matches))
|
|
seenModelIDs := make(map[string]struct{}, len(matches))
|
|
for _, match := range matches {
|
|
if len(match) != 4 {
|
|
continue
|
|
}
|
|
modelName := strings.TrimSpace(match[1])
|
|
modelID := strings.TrimSpace(match[3])
|
|
if modelName == "" || modelID == "" {
|
|
continue
|
|
}
|
|
if _, ok := seenModelIDs[modelID]; ok {
|
|
continue
|
|
}
|
|
seenModelIDs[modelID] = struct{}{}
|
|
sectionID := sensenovaSectionIDForModel(modelID)
|
|
section, err := extractHTMLSectionByID(fixture.DocsHTML, sectionID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
providerName := sensenovaProviderName(modelID)
|
|
providerCn, providerCountryCode, providerSite := providerNameCn, providerCountry, providerWebsite
|
|
if providerName != "SenseTime" {
|
|
providerCn, providerCountryCode, providerSite = providerMetadata(providerName)
|
|
}
|
|
records = append(records, officialPricingRecord{
|
|
ModelID: normalizeExternalID("sensenova", modelID),
|
|
ModelName: modelName,
|
|
ProviderName: providerName,
|
|
ProviderNameCn: providerCn,
|
|
ProviderCountry: providerCountryCode,
|
|
ProviderWebsite: providerSite,
|
|
OperatorName: "SenseNova API",
|
|
OperatorNameCn: "日日新开放平台",
|
|
OperatorCountry: "CN",
|
|
OperatorWebsite: defaultSensenovaDocsURL,
|
|
OperatorType: "official",
|
|
Region: "CN",
|
|
Currency: "CNY",
|
|
InputPrice: 0,
|
|
OutputPrice: 0,
|
|
IsFree: true,
|
|
ContextLength: sensenovaContextLength(modelID, section),
|
|
SourceURL: defaultSensenovaDocsURL,
|
|
ModelSourceURL: firstNonEmptyText(defaultSensenovaDocsURL+"#"+sectionID, defaultSensenovaDocsURL),
|
|
DateConfidence: "unknown",
|
|
DateSourceKind: "official_pricing",
|
|
Modality: sensenovaModality(modelID, section),
|
|
})
|
|
}
|
|
if len(records) == 0 {
|
|
return nil, fmt.Errorf("unexpected sensenova pricing content: empty records after parse")
|
|
}
|
|
return records, nil
|
|
}
|
|
|
|
// extractHTMLSectionByID returns the slice of raw that starts at the opening
// `<section id="sectionID"` tag and runs up to, but not including, the next
// `<section id=` tag, or to the end of raw when no further section follows.
//
// It returns an error when sectionID is empty or when no section with that ID
// exists in raw. The empty-ID guard matters because an unmapped model yields
// an empty ID; searching for `<section id=""` would either report a
// confusing "missing section" error with a blank name or silently pick up an
// unrelated id-less section.
func extractHTMLSectionByID(raw string, sectionID string) (string, error) {
	if sectionID == "" {
		return "", fmt.Errorf("unexpected sensenova docs content: section id is empty")
	}

	// The closing quote is part of the marker so that "model-u1" does not
	// match a longer ID such as "model-u1-extra".
	marker := fmt.Sprintf(`<section id="%s"`, sectionID)
	start := strings.Index(raw, marker)
	if start == -1 {
		return "", fmt.Errorf("unexpected sensenova docs content: missing section %s", sectionID)
	}

	remaining := raw[start:]
	// Search past this section's own opening tag for the next one.
	next := strings.Index(remaining[len(marker):], "<section id=")
	if next == -1 {
		return remaining, nil
	}
	return remaining[:len(marker)+next], nil
}
|
|
|
|
// sensenovaSectionIDForModel maps a model ID to the id attribute of its
// section in the docs page. The match is exact and case-sensitive; an unknown
// model ID yields the empty string.
func sensenovaSectionIDForModel(modelID string) string {
	sectionByModel := map[string]string{
		"sensenova-6.7-flash-lite": "model-flash",
		"sensenova-u1-fast":        "model-u1",
		"deepseek-v4-flash":        "model-deepseek-v4-flash",
	}
	// A missing key returns the zero value "", which is the unknown-model result.
	return sectionByModel[modelID]
}
|
|
|
|
// sensenovaProviderName returns "DeepSeek" when modelID, ignoring surrounding
// whitespace and letter case, starts with "deepseek"; every other model is
// attributed to "SenseTime".
func sensenovaProviderName(modelID string) string {
	normalized := strings.ToLower(strings.TrimSpace(modelID))
	if !strings.HasPrefix(normalized, "deepseek") {
		return "SenseTime"
	}
	return "DeepSeek"
}
|
|
|
|
// sensenovaContextLength returns 256*1024 for sensenova-6.7-flash-lite and
// deepseek-v4-flash when their docs section states a 256K context window in
// one of the two recognised phrasings. In every other case, including any
// other model ID, it returns 0.
func sensenovaContextLength(modelID string, section string) int {
	if modelID != "sensenova-6.7-flash-lite" && modelID != "deepseek-v4-flash" {
		return 0
	}
	for _, phrase := range []string{"上下文长度 256K tokens", "256K 上下文"} {
		if strings.Contains(section, phrase) {
			return 256 * 1024
		}
	}
	return 0
}
|
|
|
|
// sensenovaModality classifies a model from its ID and docs section text.
//
// sensenova-u1-fast is "image" when its section mentions the image generation
// endpoint and "multimodal" otherwise. sensenova-6.7-flash-lite is
// "multimodal" when its section mentions image input understanding. Every
// remaining case is "text".
func sensenovaModality(modelID string, section string) string {
	if modelID == "sensenova-u1-fast" {
		if strings.Contains(section, "/v1/images/generations") {
			return "image"
		}
		return "multimodal"
	}
	if modelID == "sensenova-6.7-flash-lite" && strings.Contains(section, "图像输入理解") {
		return "multimodal"
	}
	return "text"
}
|