Files
llm-intelligence/scripts/vertex_pricing_lib.go

278 lines
8.2 KiB
Go
Raw Normal View History

//go:build llm_script
package main
import (
"fmt"
"html"
"regexp"
"strings"
)
// defaultVertexPricingURL is the pricing page URL that the parsers in this file
// record as both SourceURL and ModelSourceURL on every record they emit.
const defaultVertexPricingURL = "https://cloud.google.com/gemini-enterprise-agent-platform/generative-ai/pricing"
// Regular expressions used to pick the pricing page HTML apart. All of them are
// case-insensitive and let "." span newlines, and every opening tag may carry
// attributes.
var (
	// vertexRowPattern captures the inner HTML of one table row. The \b keeps
	// the optional attribute list from turning "<track>" into a row match.
	vertexRowPattern = regexp.MustCompile(`(?is)<tr\b[^>]*>(.*?)</tr>`)
	// vertexCellPattern captures the inner HTML of one <td> or <th> cell.
	vertexCellPattern = regexp.MustCompile(`(?is)<t[dh][^>]*>(.*?)</t[dh]>`)
	// vertexHeadingPattern captures the inner HTML of an <h2>..<h4> heading.
	vertexHeadingPattern = regexp.MustCompile(`(?is)<h[2-4][^>]*>(.*?)</h[2-4]>`)
	// vertexTablePattern captures the inner HTML of a <table> element.
	vertexTablePattern = regexp.MustCompile(`(?is)<table[^>]*>(.*?)</table>`)
	// vertexStandardHeadingPattern matches an <h2>..<h5> heading whose entire
	// text is "standard" or "标准" (surrounding whitespace allowed).
	vertexStandardHeadingPattern = regexp.MustCompile(`(?is)<h[2-5][^>]*>\s*(standard|标准)\s*</h[2-5]>`)
)
// parseVertexPricingCatalog extracts pricing records from the raw pricing page
// content.
//
// It first looks for per-family blocks and parses the "Standard" table found in
// each of them. If that yields nothing, it falls back to the line-oriented
// parser over the whole input. When both strategies come up empty, the error
// distinguishes "no family blocks at all" from "family blocks found but no
// usable standard rows".
func parseVertexPricingCatalog(raw string) ([]officialPricingRecord, error) {
	blocks := splitVertexFamilyBlocks(raw)

	fromTables := make([]officialPricingRecord, 0)
	for _, familyHTML := range blocks {
		standard := extractVertexStandardTable(familyHTML)
		if strings.TrimSpace(standard) != "" {
			fromTables = append(fromTables, parseVertexStandardTable(standard)...)
		}
	}
	if len(fromTables) != 0 {
		return fromTables, nil
	}

	// Table parsing produced nothing; try the text-line fallback.
	if fromText := parseVertexStandardTextBlocks(raw); len(fromText) != 0 {
		return fromText, nil
	}

	if len(blocks) != 0 {
		return nil, fmt.Errorf("no vertex standard pricing rows found")
	}
	return nil, fmt.Errorf("unexpected vertex pricing content")
}
// parseVertexStandardTable turns the rows of one pricing table (inner HTML of a
// <table>) into pricing records.
//
// The table is read top to bottom while tracking the current model name and
// the most recent text-input price for it. A model name is taken either from a
// single-cell row (unless that cell contains "Model") or from the first cell of
// a row with more than two cells when that cell mentions "gemini". A record is
// emitted for every text-output row that has a parseable price and a known,
// non-zero input price for the current model.
func parseVertexStandardTable(table string) []officialPricingRecord {
	rows := vertexRowPattern.FindAllStringSubmatch(table, -1)
	records := make([]officialPricingRecord, 0)
	currentModel := ""
	// Zero doubles as "no input price seen yet for the current model".
	currentInput := 0.0
	for _, row := range rows {
		cells := vertexCellPattern.FindAllStringSubmatch(row[1], -1)
		if len(cells) == 0 {
			continue
		}
		values := make([]string, 0, len(cells))
		for _, cell := range cells {
			values = append(values, cleanHTMLText(cell[1]))
		}
		// A lone cell that is not the "Model" header names the model for the
		// rows that follow.
		if len(values) == 1 && !strings.Contains(values[0], "Model") {
			currentModel = values[0]
			currentInput = 0
			continue
		}
		if len(values) < 2 {
			continue
		}
		rowType := values[0]
		priceCell := values[1]
		if len(values) > 2 && strings.Contains(strings.ToLower(values[0]), "gemini") {
			// Switching to a different model must drop the previous model's
			// input price; otherwise an output row whose own input price was
			// missing or unparseable would be paired with a stale value.
			// Rows that repeat the same model name keep the price.
			if values[0] != currentModel {
				currentInput = 0
			}
			currentModel = values[0]
			rowType = values[1]
			priceCell = values[2]
		}
		if strings.TrimSpace(currentModel) == "" || strings.EqualFold(currentModel, "Model") {
			continue
		}
		switch {
		case strings.HasPrefix(rowType, "Input (text"), strings.HasPrefix(rowType, "输入(文本"):
			price, ok := firstDollarPrice(priceCell)
			if ok {
				currentInput = price
			}
		case strings.HasPrefix(rowType, "Text output"), strings.HasPrefix(rowType, "文本输出"):
			outputPrice, ok := firstDollarPrice(priceCell)
			if !ok || currentInput == 0 {
				continue
			}
			providerNameCn, providerCountry, providerWebsite := providerMetadata("Google")
			record := officialPricingRecord{
				ModelID:         normalizeExternalID("vertex", currentModel),
				ModelName:       currentModel,
				ProviderName:    "Google",
				ProviderNameCn:  providerNameCn,
				ProviderCountry: providerCountry,
				ProviderWebsite: providerWebsite,
				OperatorName:    "Google Cloud Vertex AI",
				OperatorNameCn:  "Google Cloud Vertex AI",
				OperatorCountry: "US",
				OperatorWebsite: "https://cloud.google.com/vertex-ai",
				OperatorType:    "cloud",
				Region:          "global",
				Currency:        "USD",
				InputPrice:      currentInput,
				OutputPrice:     outputPrice,
				SourceURL:       defaultVertexPricingURL,
				ModelSourceURL:  defaultVertexPricingURL,
				DateConfidence:  "unknown",
				DateSourceKind:  "official_pricing",
				Modality:        detectModality(currentModel),
			}
			// InputPrice is non-zero here because of the guard above, so this
			// currently always evaluates to false.
			record.IsFree = record.InputPrice == 0 && record.OutputPrice == 0
			records = append(records, record)
		}
	}
	return records
}
// splitVertexFamilyBlocks cuts raw into one chunk per heading whose cleaned
// text mentions "gemini" (case-insensitive). Each chunk starts at its heading
// and runs up to the next such heading, or to the end of raw for the last one.
// Content before the first matching heading is dropped. The result is empty
// (but non-nil) when no heading qualifies.
func splitVertexFamilyBlocks(raw string) []string {
	starts := make([]int, 0)
	for _, loc := range vertexHeadingPattern.FindAllStringSubmatchIndex(raw, -1) {
		// loc[2]:loc[3] is the heading's inner HTML; loc[0] is where the
		// opening tag begins.
		heading := cleanHTMLText(raw[loc[2]:loc[3]])
		if strings.Contains(strings.ToLower(heading), "gemini") {
			starts = append(starts, loc[0])
		}
	}

	out := make([]string, 0, len(starts))
	for idx := range starts {
		stop := len(raw)
		if idx < len(starts)-1 {
			stop = starts[idx+1]
		}
		out = append(out, raw[starts[idx]:stop])
	}
	return out
}
// extractVertexStandardTable returns the inner HTML of the first <table> that
// follows a "Standard" heading in raw. It returns "" when raw has no such
// heading or no table after it.
func extractVertexStandardTable(raw string) string {
	loc := vertexStandardHeadingPattern.FindStringIndex(raw)
	if loc == nil {
		return ""
	}
	// Only look at what comes after the heading's closing tag.
	if m := vertexTablePattern.FindStringSubmatch(raw[loc[1]:]); len(m) == 2 {
		return m[1]
	}
	return ""
}
// parseVertexStandardTextBlocks is the line-oriented fallback parser.
//
// It walks the text lines produced by htmlLines and tracks which pricing
// section it is in; only lines inside a "standard"/"标准" section are used.
// There, lines without a "$" accumulate into the model name, a text-input
// price line sets the pending input price, and a text-output price line emits
// a record (provided an input price is pending) and clears the accumulated
// state. The collected records are passed through
// dedupeOfficialPricingRecords before being returned.
func parseVertexStandardTextBlocks(raw string) []officialPricingRecord {
	var (
		found      = make([]officialPricingRecord, 0)
		nameParts  = make([]string, 0)
		inputPrice float64 // zero means no input price is pending
		inStandard bool
	)

	for _, text := range htmlLines(raw) {
		lowered := strings.ToLower(text)

		// Any recognised section title switches sections and clears state.
		if title := normalizeVertexSectionTitle(lowered); title != "" {
			inStandard = title == "standard" || title == "标准"
			nameParts = nameParts[:0]
			inputPrice = 0
			continue
		}
		if !inStandard {
			continue
		}
		// Skip the table header line.
		if strings.Contains(lowered, "model type price") {
			continue
		}
		// Lines without a price are treated as part of the model name.
		if !strings.Contains(text, "$") {
			nameParts = append(nameParts, text)
			continue
		}

		name := strings.TrimSpace(strings.Join(nameParts, " "))
		if name == "" {
			continue
		}

		isInput := strings.HasPrefix(lowered, "input (text") ||
			strings.HasPrefix(lowered, "1m input text tokens")
		if isInput {
			if parsed, ok := firstDollarPrice(text); ok {
				inputPrice = parsed
			}
			continue
		}

		isOutput := strings.HasPrefix(lowered, "text output") ||
			strings.HasPrefix(lowered, "1m output text tokens")
		if !isOutput {
			continue
		}

		outputPrice, ok := firstDollarPrice(text)
		if !ok || inputPrice == 0 {
			continue
		}

		nameCn, country, website := providerMetadata("Google")
		found = append(found, officialPricingRecord{
			ModelID:         normalizeExternalID("vertex", name),
			ModelName:       name,
			ProviderName:    "Google",
			ProviderNameCn:  nameCn,
			ProviderCountry: country,
			ProviderWebsite: website,
			OperatorName:    "Google Cloud Vertex AI",
			OperatorNameCn:  "Google Cloud Vertex AI",
			OperatorCountry: "US",
			OperatorWebsite: "https://cloud.google.com/vertex-ai",
			OperatorType:    "cloud",
			Region:          "global",
			Currency:        "USD",
			InputPrice:      inputPrice,
			OutputPrice:     outputPrice,
			SourceURL:       defaultVertexPricingURL,
			ModelSourceURL:  defaultVertexPricingURL,
			DateConfidence:  "unknown",
			DateSourceKind:  "official_pricing",
			Modality:        detectModality(name),
			IsFree:          inputPrice == 0 && outputPrice == 0,
		})

		// Start collecting the next model from scratch.
		nameParts = nameParts[:0]
		inputPrice = 0
	}

	return dedupeOfficialPricingRecords(found)
}
// normalizeVertexSectionTitle strips leading '#' characters and surrounding
// whitespace from line and returns the result if it is exactly one of the
// known pricing section titles; otherwise it returns "". The comparison is
// case-sensitive, so callers are expected to pass an already lower-cased line.
func normalizeVertexSectionTitle(line string) string {
	candidate := strings.TrimSpace(strings.TrimLeft(line, "#"))

	known := [...]string{
		"standard", "标准",
		"priority", "优先级",
		"flex/batch", "灵活/批处理",
		"batch api",
		"live api",
	}
	for _, name := range known {
		if candidate == name {
			return candidate
		}
	}
	return ""
}
// Helpers for htmlLines, built once at package scope instead of on every call.
var (
	// vertexHTMLBreakReplacer turns line-break tags and the closing tags of
	// block-level elements into newlines.
	vertexHTMLBreakReplacer = strings.NewReplacer(
		"<br>", "\n",
		"<br/>", "\n",
		"<br />", "\n",
		"</p>", "\n",
		"</div>", "\n",
		"</section>", "\n",
		"</tr>", "\n",
		"</td>", "\n",
		"</th>", "\n",
		"</li>", "\n",
		"</h1>", "\n",
		"</h2>", "\n",
		"</h3>", "\n",
		"</h4>", "\n",
		"</h5>", "\n",
		"</h6>", "\n",
	)
	// vertexHTMLTagPattern matches any remaining HTML tag.
	vertexHTMLTagPattern = regexp.MustCompile(`(?is)<[^>]+>`)
	// vertexHTMLSpacePattern matches runs of spaces and tabs.
	vertexHTMLSpacePattern = regexp.MustCompile(`[ \t]+`)
)

// htmlLines flattens an HTML fragment into its non-empty text lines.
//
// Line-break tags and block-level closing tags become newlines, every other
// tag is removed, entities are decoded, non-breaking spaces become ordinary
// spaces, and runs of spaces/tabs collapse to a single space. Each returned
// line is trimmed; blank lines are dropped.
func htmlLines(raw string) []string {
	withBreaks := vertexHTMLBreakReplacer.Replace(raw)

	// Strip tags BEFORE decoding entities. Decoding first would turn escaped
	// text such as "&lt;= 200K ... &gt; 200K" into literal angle brackets that
	// the tag pattern would then delete as if they were markup.
	cleaned := vertexHTMLTagPattern.ReplaceAllString(withBreaks, "")
	cleaned = html.UnescapeString(cleaned)

	cleaned = strings.ReplaceAll(cleaned, "\r\n", "\n")
	cleaned = strings.ReplaceAll(cleaned, "\r", "\n")
	cleaned = strings.ReplaceAll(cleaned, "\u00a0", " ")

	rawLines := strings.Split(cleaned, "\n")
	lines := make([]string, 0, len(rawLines))
	for _, line := range rawLines {
		line = strings.TrimSpace(vertexHTMLSpacePattern.ReplaceAllString(line, " "))
		if line == "" {
			continue
		}
		lines = append(lines, line)
	}
	return lines
}