Files
llm-intelligence/scripts/import_phase2_data.go

446 lines
13 KiB
Go
Raw Normal View History

//go:build llm_script && !scripts_pkg
package main
import (
"database/sql"
"encoding/json"
"fmt"
"log"
"os"
"strings"
"time"
_ "github.com/lib/pq"
)
// RawData mirrors the JSON document that main decodes from
// /tmp/phase2_raw_data.json. It holds one array of pricing rows per vendor,
// keyed "zhipu" and "baidu".
type RawData struct {
// Zhipu rows carry their prices as strings rather than numbers.
// NOTE(review): presumably these are meant to go through parseZhipuPrice
// (values such as "6元" or "免费"); main does not read this slice.
Zhipu []struct {
Model string `json:"model"`
Context string `json:"context"`
InputPrice string `json:"inputPrice"`
OutputPrice string `json:"outputPrice"`
Operator string `json:"operator"`
Region string `json:"region"`
Currency string `json:"currency"`
} `json:"zhipu"`
// Baidu rows carry numeric prices. The pointers let main tell a missing or
// null price (nil) apart from an explicit zero. main matches Type against
// the substrings "输入" (input) and "输出" (output) to decide which side of
// the price a row describes.
Baidu []struct {
Model string `json:"model"`
Type string `json:"type"`
InputPrice *float64 `json:"inputPrice"`
OutputPrice *float64 `json:"outputPrice"`
Operator string `json:"operator"`
Region string `json:"region"`
Currency string `json:"currency"`
} `json:"baidu"`
}
// ModelPricing is the normalized record for one model that main writes to the
// database: provider, operator, model metadata, and a single price row.
type ModelPricing struct {
// ModelID is stored as models.external_id; ModelName as models.name.
ModelID string
ModelName string
// ProviderName/ProviderCountry are used to find or create the model_provider row.
ProviderName string
ProviderCountry string
// OperatorName is used to find or create the operator row and is also stored
// as models.source. OperatorType becomes region_pricing.source_type unless
// the entry is free.
OperatorName string
OperatorType string
Region string
Currency string
// InputPrice and OutputPrice are written to input_price_per_mtok and
// output_price_per_mtok respectively.
InputPrice float64
OutputPrice float64
ContextLength int
// IsFree switches the pricing row to source_type "free_tier".
IsFree bool
// SourceURL is the pricing page; ModelSourceURL, when set, takes precedence
// over it for models.source_url.
SourceURL string
ModelSourceURL string
// ReleaseDate is a "2006-01-02" formatted string (see releaseDateValue).
ReleaseDate string
DateConfidence string
DateSourceKind string
Modality string
SceneTags []string
}
// releaseDateValue converts a "YYYY-MM-DD" release date string into the value
// passed as the release_date query argument.
//
// It returns nil when raw is blank or is not a valid date in that layout, and
// the parsed time.Time otherwise. Leading and trailing whitespace is ignored.
func releaseDateValue(raw string) any {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return nil
	}
	// Parse the trimmed text, not raw: time.Parse rejects surrounding
	// whitespace, which would silently turn a valid padded date into nil.
	parsed, err := time.Parse("2006-01-02", trimmed)
	if err != nil {
		return nil
	}
	return parsed
}
// baiduModelMetadata is one release-metadata rule for Baidu-hosted models.
// enrichBaiduModelMetadata applies a rule when the lower-cased model ID starts
// with Prefix; fields left empty in the rule are not copied onto the model.
type baiduModelMetadata struct {
// Prefix is compared with strings.HasPrefix against the lower-cased ModelID.
Prefix string
// ReleaseDate uses the "2006-01-02" layout.
ReleaseDate string
ModelSourceURL string
DateConfidence string
DateSourceKind string
}
// baiduModelMetadataRules is the ordered rule table consulted by
// enrichBaiduModelMetadata. The first rule whose Prefix matches wins, so the
// order is significant: a longer prefix must appear before any shorter prefix
// it starts with (for example "baidu-ernie-4.5-turbo-vl" before
// "baidu-ernie-4.5-turbo" before "baidu-ernie-4.5", and "baidu-ernie-x1.1" /
// "baidu-ernie-x1-turbo" before "baidu-ernie-x1").
var baiduModelMetadataRules = []baiduModelMetadata{
{
Prefix: "baidu-ernie-5.0",
ReleaseDate: "2026-01-22",
ModelSourceURL: "https://cloud.baidu.com/news/news_eacd0f0b-0ca3-4963-aec8-5e6b9ebef9ba",
DateConfidence: "official_primary",
DateSourceKind: "official_announcement",
},
{
Prefix: "baidu-ernie-x1.1",
ReleaseDate: "2025-09-09",
ModelSourceURL: "https://cloud.baidu.com/news/news_be713ff4-8477-4852-88f1-9cc56c406d6a",
DateConfidence: "official_primary",
DateSourceKind: "official_announcement",
},
{
Prefix: "baidu-ernie-5.1",
ReleaseDate: "2026-05-09",
ModelSourceURL: "https://ernie.baidu.com/blog/posts/ernie-5.1-0508-release/",
DateConfidence: "official_primary",
DateSourceKind: "official_announcement",
},
// Must stay ahead of the shorter "baidu-ernie-4.5-turbo" and "baidu-ernie-4.5" rules.
{
Prefix: "baidu-ernie-4.5-turbo-vl",
ReleaseDate: "2025-08-07",
ModelSourceURL: "https://cloud.baidu.com/product/wenxinworkshop.html",
DateConfidence: "secondary_authoritative",
DateSourceKind: "secondary_authoritative_report",
},
{
Prefix: "baidu-ernie-4.5-turbo",
ReleaseDate: "2025-04-25",
ModelSourceURL: "https://cloud.baidu.com/article/3887765",
DateConfidence: "official_primary",
DateSourceKind: "official_announcement",
},
// Must stay ahead of the shorter "baidu-ernie-x1" rule.
{
Prefix: "baidu-ernie-x1-turbo",
ReleaseDate: "2025-04-25",
ModelSourceURL: "https://cloud.baidu.com/article/3887765",
DateConfidence: "official_primary",
DateSourceKind: "official_announcement",
},
{
Prefix: "baidu-ernie-4.5",
ReleaseDate: "2025-03-16",
ModelSourceURL: "https://cloud.baidu.com/article/3835921",
DateConfidence: "official_primary",
DateSourceKind: "official_announcement",
},
{
Prefix: "baidu-ernie-x1",
ReleaseDate: "2025-03-16",
ModelSourceURL: "https://cloud.baidu.com/article/3835921",
DateConfidence: "official_primary",
DateSourceKind: "official_announcement",
},
{
Prefix: "baidu-ernie-character",
ReleaseDate: "2024-03-22",
ModelSourceURL: "https://cloud.baidu.com/news/news_667c065f-0bd7-475d-98c2-901763d0ee77",
DateConfidence: "official_primary",
DateSourceKind: "official_announcement",
},
{
Prefix: "baidu-ernie-lite-pro",
ReleaseDate: "2024-03-22",
ModelSourceURL: "https://cloud.baidu.com/news/news_667c065f-0bd7-475d-98c2-901763d0ee77",
DateConfidence: "official_primary",
DateSourceKind: "official_announcement",
},
{
Prefix: "baidu-ernie-speed-pro",
ReleaseDate: "2024-03-22",
ModelSourceURL: "https://cloud.baidu.com/news/news_667c065f-0bd7-475d-98c2-901763d0ee77",
DateConfidence: "official_primary",
DateSourceKind: "official_announcement",
},
// The remaining rules carry no release date or model source URL; they only
// tag the model with confidence "unknown" and source kind "catalog_backfill".
{
Prefix: "baidu-qianfan-",
DateConfidence: "unknown",
DateSourceKind: "catalog_backfill",
},
{
Prefix: "baidu-deepseek-",
DateConfidence: "unknown",
DateSourceKind: "catalog_backfill",
},
{
Prefix: "baidu-glm-",
DateConfidence: "unknown",
DateSourceKind: "catalog_backfill",
},
{
Prefix: "baidu-qwen",
DateConfidence: "unknown",
DateSourceKind: "catalog_backfill",
},
{
Prefix: "baidu-minimax-",
DateConfidence: "unknown",
DateSourceKind: "catalog_backfill",
},
{
Prefix: "baidu-kimi-",
DateConfidence: "unknown",
DateSourceKind: "catalog_backfill",
},
{
Prefix: "baidu-internvl",
DateConfidence: "unknown",
DateSourceKind: "catalog_backfill",
},
}
// enrichBaiduModelMetadata returns a copy of model with release metadata
// filled in from baiduModelMetadataRules.
//
// The model ID is lower-cased and compared against each rule's Prefix in table
// order; only the first matching rule is applied, and only its non-empty
// fields overwrite the model. Whether or not a rule matched, any field that is
// still empty afterwards receives a fallback: ModelSourceURL defaults to
// SourceURL, and DateConfidence / DateSourceKind default to "unknown".
func enrichBaiduModelMetadata(model ModelPricing) ModelPricing {
	normalizedID := strings.ToLower(model.ModelID)
	for _, rule := range baiduModelMetadataRules {
		if !strings.HasPrefix(normalizedID, rule.Prefix) {
			continue
		}
		if rule.ReleaseDate != "" {
			model.ReleaseDate = rule.ReleaseDate
		}
		if rule.ModelSourceURL != "" {
			model.ModelSourceURL = rule.ModelSourceURL
		}
		if rule.DateConfidence != "" {
			model.DateConfidence = rule.DateConfidence
		}
		if rule.DateSourceKind != "" {
			model.DateSourceKind = rule.DateSourceKind
		}
		// First match wins. Break rather than return so that a rule which
		// leaves some fields empty still gets the fallbacks below.
		break
	}

	if model.ModelSourceURL == "" {
		model.ModelSourceURL = model.SourceURL
	}
	if model.DateConfidence == "" {
		model.DateConfidence = "unknown"
	}
	if model.DateSourceKind == "" {
		model.DateSourceKind = "unknown"
	}
	return model
}
// hasExplicitModelMetadata reports whether model carries any metadata beyond
// the generic fallbacks. It is true when at least one of these holds:
//   - ReleaseDate is non-blank;
//   - ModelSourceURL is non-empty and differs from SourceURL;
//   - DateConfidence is non-blank and is not exactly "unknown";
//   - DateSourceKind is non-blank and is not exactly "unknown".
func hasExplicitModelMetadata(model ModelPricing) bool {
	if strings.TrimSpace(model.ReleaseDate) != "" {
		return true
	}
	if model.ModelSourceURL != "" && model.ModelSourceURL != model.SourceURL {
		return true
	}
	// Blankness is judged on the trimmed text, but the "unknown" comparison is
	// made against the untrimmed field.
	if strings.TrimSpace(model.DateConfidence) != "" && model.DateConfidence != "unknown" {
		return true
	}
	if strings.TrimSpace(model.DateSourceKind) == "" {
		return false
	}
	return model.DateSourceKind != "unknown"
}
// parseZhipuPrice turns a Zhipu price label into a number.
//
// Any label containing "免费" (free), such as "免费" or "限时免费", yields 0.
// Otherwise the leading decimal number is scanned, so "6元" yields 6. Text with
// no leading number yields 0.
func parseZhipuPrice(s string) float64 {
	const freeMarker = "免费"
	if strings.Contains(s, freeMarker) {
		return 0
	}

	var price float64
	// The scan result is intentionally discarded: when nothing can be parsed,
	// price keeps its zero value, which is what gets returned.
	_, _ = fmt.Sscanf(s, "%f", &price)
	return price
}
// extractContextLength maps a context-window label to a token count by
// substring match. It returns 0 when no known marker is present.
//
// Markers are tried in the order listed below and the first hit wins, so the
// order matters: "128K" must be tested before "8K", which it contains.
func extractContextLength(context string) int {
	table := []struct {
		markers []string
		tokens  int
	}{
		{[]string{"1M", "1000K"}, 1000000},
		{[]string{"200K"}, 200000},
		{[]string{"128K"}, 128000},
		{[]string{"32K"}, 32000},
		{[]string{"8K"}, 8000},
		{[]string{"262144", "256K"}, 262144},
		{[]string{"8192"}, 8192},
	}
	for _, entry := range table {
		for _, marker := range entry.markers {
			if strings.Contains(context, marker) {
				return entry.tokens
			}
		}
	}
	return 0
}
// main imports Baidu pricing rows into PostgreSQL.
//
// It connects using DATABASE_URL (falling back to a local unix-socket DSN),
// decodes /tmp/phase2_raw_data.json into RawData, folds the Baidu rows into
// one ModelPricing per model name, and then for each model finds or creates
// the provider, operator and model rows, refreshes the model's release
// metadata, and upserts today's region_pricing row. Only raw.Baidu is
// processed here; raw.Zhipu is decoded but not imported.
//
// Connection, file and JSON errors are fatal. Per-model database errors are
// logged and that model is skipped; the final log lines report how many models
// were actually imported and how many failed.
func main() {
	dsn := os.Getenv("DATABASE_URL")
	if dsn == "" {
		dsn = "postgres://long@/llm_intelligence?host=/var/run/postgresql"
	}
	db, err := sql.Open("postgres", dsn)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
	// sql.Open does not necessarily connect. Ping so an unreachable database
	// aborts the run up front instead of failing once per model.
	if err := db.Ping(); err != nil {
		log.Fatalf("Failed to connect to database: %v", err)
	}

	// Read raw data
	data, err := os.ReadFile("/tmp/phase2_raw_data.json")
	if err != nil {
		log.Fatalf("Failed to read raw data: %v", err)
	}
	var raw RawData
	if err := json.Unmarshal(data, &raw); err != nil {
		log.Fatalf("Failed to parse raw data: %v", err)
	}

	var prices []ModelPricing
	batchID := "manual-seed"

	// Process Baidu data
	modelPrices := make(map[string]map[string]float64) // model -> type -> price
	for _, b := range raw.Baidu {
		byKind := modelPrices[b.Model]
		if byKind == nil {
			byKind = make(map[string]float64)
			modelPrices[b.Model] = byKind
		}
		isInput := strings.Contains(b.Type, "输入")
		isOutput := strings.Contains(b.Type, "输出")
		if b.InputPrice != nil {
			perMillion := *b.InputPrice * 1000000 // Convert to per 1M
			if isInput {
				byKind["input"] = perMillion
			}
			// NOTE(review): an output-typed row takes its price from the
			// inputPrice field unless outputPrice is also set (handled below).
			// Presumably the raw data uses a single price column; confirm.
			if isOutput {
				byKind["output"] = perMillion
			}
		}
		if b.OutputPrice != nil && isOutput {
			byKind["output"] = *b.OutputPrice * 1000000
		}
	}
	for model, pricesMap := range modelPrices {
		prices = append(prices, enrichBaiduModelMetadata(ModelPricing{
			ModelID:         "baidu-" + strings.ToLower(strings.ReplaceAll(model, " ", "-")),
			ModelName:       model,
			ProviderName:    "Baidu",
			ProviderCountry: "CN",
			OperatorName:    "Baidu Qianfan",
			OperatorType:    "official",
			Region:          "CN",
			Currency:        "CNY",
			InputPrice:      pricesMap["input"],
			OutputPrice:     pricesMap["output"],
			// A model with no recorded prices at all also ends up flagged free.
			IsFree:    pricesMap["input"] == 0 && pricesMap["output"] == 0,
			SourceURL: "https://cloud.baidu.com/doc/qianfan/s/wmh4sv6ya",
			Modality:  "text",
		}))
	}
	log.Printf("Parsed %d unique models from Baidu", len(prices))

	// Save to database
	imported := 0
	for _, p := range prices {
		// Find or create provider
		var providerID int64
		err := db.QueryRow("SELECT id FROM model_provider WHERE name = $1", p.ProviderName).Scan(&providerID)
		if err == sql.ErrNoRows {
			err = db.QueryRow(
				"INSERT INTO model_provider (name, country, website, status) VALUES ($1, $2, $3, 'active') RETURNING id",
				p.ProviderName, p.ProviderCountry, "",
			).Scan(&providerID)
		}
		if err != nil {
			log.Printf("Provider error: %v", err)
			continue
		}

		// Find or create operator
		var operatorID int64
		err = db.QueryRow("SELECT id FROM operator WHERE name = $1", p.OperatorName).Scan(&operatorID)
		if err == sql.ErrNoRows {
			err = db.QueryRow(
				"INSERT INTO operator (name, country, status) VALUES ($1, $2, 'active') RETURNING id",
				p.OperatorName, p.ProviderCountry,
			).Scan(&operatorID)
		}
		if err != nil {
			log.Printf("Operator error: %v", err)
			continue
		}

		modelSourceURL := firstNonEmpty(p.ModelSourceURL, p.SourceURL)
		releaseDate := releaseDateValue(p.ReleaseDate)

		// Find or create model
		var modelID int64
		err = db.QueryRow("SELECT id FROM models WHERE external_id = $1", p.ModelID).Scan(&modelID)
		if err == sql.ErrNoRows {
			err = db.QueryRow(
				`INSERT INTO models (external_id, name, provider_id, modality, context_length, status, source, batch_id, source_url, release_date, date_confidence, date_source_kind)
VALUES ($1, $2, $3, $4, $5, 'active', $6, $7, $8, $9, $10, $11) RETURNING id`,
				p.ModelID, p.ModelName, providerID, p.Modality, p.ContextLength, p.OperatorName, batchID, modelSourceURL, releaseDate, p.DateConfidence, p.DateSourceKind,
			).Scan(&modelID)
		}
		if err != nil {
			log.Printf("Model error: %v", err)
			continue
		}

		// Refresh release metadata. When $4 (explicit metadata) is true the
		// stored values are overwritten; otherwise they only fill in blanks.
		// A failure here is logged but does not block the pricing upsert.
		if _, err := db.Exec(
			`UPDATE models
SET source_url = CASE
WHEN $4 THEN $2
ELSE COALESCE(NULLIF(source_url, ''), $2)
END,
release_date = CASE
WHEN $4 THEN $3::date
ELSE COALESCE(release_date, $3::date)
END,
date_confidence = CASE
WHEN $4 THEN $5
ELSE COALESCE(NULLIF(date_confidence, ''), $5, 'unknown')
END,
date_source_kind = CASE
WHEN $4 THEN $6
ELSE COALESCE(NULLIF(date_source_kind, ''), $6, 'unknown')
END,
updated_at = CURRENT_TIMESTAMP
WHERE id = $1`,
			modelID, modelSourceURL, releaseDate, hasExplicitModelMetadata(p), p.DateConfidence, p.DateSourceKind,
		); err != nil {
			log.Printf("Model metadata update error for %s: %v", p.ModelID, err)
		}

		// Insert pricing
		sourceType := p.OperatorType
		freeQuota := ""
		freeLimitations := "[]"
		rateLimit := "{}"
		if p.IsFree {
			sourceType = "free_tier"
			freeQuota = "Imported free-tier pricing entry"
			freeLimitations = `["See source_url for current quota and policy"]`
		}
		_, err = db.Exec(
			`INSERT INTO region_pricing
(model_id, operator_id, region, currency, input_price_per_mtok, output_price_per_mtok, is_free, effective_date, source_url, source_type, free_quota, free_limitations, rate_limit)
VALUES ($1, $2, $3, $4, $5, $6, $7, CURRENT_DATE, $8, $9, $10, $11, $12)
ON CONFLICT (model_id, operator_id, region, currency, effective_date)
DO UPDATE SET input_price_per_mtok = EXCLUDED.input_price_per_mtok,
output_price_per_mtok = EXCLUDED.output_price_per_mtok,
is_free = EXCLUDED.is_free,
source_type = EXCLUDED.source_type,
free_quota = EXCLUDED.free_quota,
free_limitations = EXCLUDED.free_limitations,
rate_limit = EXCLUDED.rate_limit,
updated_at = CURRENT_TIMESTAMP`,
			modelID, operatorID, p.Region, p.Currency, p.InputPrice, p.OutputPrice, p.IsFree, p.SourceURL,
			sourceType, freeQuota, freeLimitations, rateLimit,
		)
		if err != nil {
			log.Printf("Pricing error for %s: %v", p.ModelID, err)
			continue
		}
		imported++
	}

	// Report what was actually written, not how many models were parsed.
	log.Printf("Successfully imported %d models into database", imported)
	if failed := len(prices) - imported; failed > 0 {
		log.Printf("Failed to import %d of %d models; see errors above", failed, len(prices))
	}
}
// firstNonEmpty returns the first argument that is not the empty string, in
// argument order. Whitespace-only strings count as non-empty. It returns ""
// when called with no arguments or when every argument is empty.
func firstNonEmpty(values ...string) string {
	for i := range values {
		if len(values[i]) > 0 {
			return values[i]
		}
	}
	return ""
}