Fix Services & Update the middleware && others
@@ -5,93 +5,179 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
-	"gemini-balancer/internal/db/dialect"
-	"gemini-balancer/internal/models"
-	"gemini-balancer/internal/store"
 	"strconv"
 	"strings"
 	"sync"
 	"sync/atomic"
 	"time"

+	"gemini-balancer/internal/db/dialect"
+	"gemini-balancer/internal/models"
+	"gemini-balancer/internal/settings"
+	"gemini-balancer/internal/store"
+
 	"github.com/sirupsen/logrus"
 	"gorm.io/gorm"
 )

 const (
-	flushLoopInterval = 1 * time.Minute
+	defaultFlushInterval = 1 * time.Minute
+	maxRetryAttempts     = 3
+	retryDelay           = 5 * time.Second
 )

 type AnalyticsServiceLogger struct{ *logrus.Entry }
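
+// AnalyticsService is the write (command) side of the analytics pipeline: it
+// folds RequestFinishedEvent messages into an hourly Redis hash and
+// periodically flushes the aggregates into models.StatsHourly rows.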
 type AnalyticsService struct {
-	db       *gorm.DB
-	store    store.Store
-	logger   *logrus.Entry
+	db              *gorm.DB
+	store           store.Store
+	logger          *logrus.Entry
+	dialect         dialect.DialectAdapter
+	settingsManager *settings.SettingsManager

 	stopChan chan struct{}
 	wg       sync.WaitGroup
-	dialect  dialect.DialectAdapter
+	ctx      context.Context
+	cancel   context.CancelFunc
+
+	// Runtime metrics.
+	eventsReceived  atomic.Uint64
+	eventsProcessed atomic.Uint64
+	eventsFailed    atomic.Uint64
+	flushCount      atomic.Uint64
+	recordsFlushed  atomic.Uint64
+	flushErrors     atomic.Uint64
+	lastFlushTime   time.Time
+	lastFlushMutex  sync.RWMutex
+
+	// Runtime configuration.
+	flushInterval time.Duration
+	configMutex   sync.RWMutex
 }
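
+// A minimal wiring sketch (caller-side names are illustrative, not from this
+// package):
+//
+//	svc := NewAnalyticsService(db, redisStore, logger, adapter, settingsMgr)
+//	svc.Start()
+//	defer svc.Stop()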
-func NewAnalyticsService(db *gorm.DB, s store.Store, logger *logrus.Logger, d dialect.DialectAdapter) *AnalyticsService {
+func NewAnalyticsService(
+	db *gorm.DB,
+	s store.Store,
+	logger *logrus.Logger,
+	d dialect.DialectAdapter,
+	settingsManager *settings.SettingsManager,
+) *AnalyticsService {
+	ctx, cancel := context.WithCancel(context.Background())
+
 	return &AnalyticsService{
-		db:       db,
-		store:    s,
-		logger:   logger.WithField("component", "Analytics📊"),
-		stopChan: make(chan struct{}),
-		dialect:  d,
+		db:              db,
+		store:           s,
+		logger:          logger.WithField("component", "Analytics📊"),
+		dialect:         d,
+		settingsManager: settingsManager,
+		stopChan:        make(chan struct{}),
+		ctx:             ctx,
+		cancel:          cancel,
+		flushInterval:   defaultFlushInterval,
+		lastFlushTime:   time.Now(),
 	}
 }

 func (s *AnalyticsService) Start() {
-	s.wg.Add(2)
-	go s.flushLoop()
+	s.wg.Add(3)
 	go s.eventListener()
-	s.logger.Info("AnalyticsService (Command Side) started.")
+	go s.flushLoop()
+	go s.metricsReporter()
+
+	s.logger.WithFields(logrus.Fields{
+		"flush_interval": s.flushInterval,
+	}).Info("AnalyticsService started")
 }
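
+// Stop shuts down in order: signal the loops (stopChan plus context cancel),
+// wait for them to exit, then run one final synchronous flush so in-flight
+// Redis aggregates are not lost.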
 func (s *AnalyticsService) Stop() {
 	s.logger.Info("AnalyticsService stopping...")
 	close(s.stopChan)
+	s.cancel()
 	s.wg.Wait()
-	s.logger.Info("AnalyticsService stopped. Performing final data flush...")
+
+	s.logger.Info("Performing final data flush...")
 	s.flushToDB()
-	s.logger.Info("AnalyticsService final data flush completed.")
+
+	// Log the final counters.
+	s.logger.WithFields(logrus.Fields{
+		"events_received":  s.eventsReceived.Load(),
+		"events_processed": s.eventsProcessed.Load(),
+		"events_failed":    s.eventsFailed.Load(),
+		"flush_count":      s.flushCount.Load(),
+		"records_flushed":  s.recordsFlushed.Load(),
+		"flush_errors":     s.flushErrors.Load(),
+	}).Info("AnalyticsService stopped")
 }

+// eventListener consumes request-finished events until shutdown.
 func (s *AnalyticsService) eventListener() {
 	defer s.wg.Done()
-	sub, err := s.store.Subscribe(context.Background(), models.TopicRequestFinished)
+
+	sub, err := s.store.Subscribe(s.ctx, models.TopicRequestFinished)
 	if err != nil {
-		s.logger.Fatalf("Failed to subscribe to topic %s: %v", models.TopicRequestFinished, err)
+		s.logger.WithError(err).Error("Failed to subscribe to request events, analytics disabled")
 		return
 	}
-	defer sub.Close()
-	s.logger.Info("AnalyticsService subscribed to request events.")
+	defer func() {
+		if err := sub.Close(); err != nil {
+			s.logger.WithError(err).Warn("Failed to close subscription")
+		}
+	}()
+
+	s.logger.Info("Subscribed to request events for analytics")
+
 	for {
 		select {
 		case msg := <-sub.Channel():
-			var event models.RequestFinishedEvent
-			if err := json.Unmarshal(msg.Payload, &event); err != nil {
-				s.logger.Errorf("Failed to unmarshal analytics event: %v", err)
-				continue
-			}
-			s.handleAnalyticsEvent(&event)
+			s.handleMessage(msg)
+
 		case <-s.stopChan:
-			s.logger.Info("AnalyticsService stopping event listener.")
+			s.logger.Info("Event listener stopping")
 			return
+
+		case <-s.ctx.Done():
+			s.logger.Info("Event listener context cancelled")
+			return
 		}
 	}
 }
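
+// Hourly aggregates live in one Redis hash per UTC hour:
+//
+//	key:   analytics:hourly:<yyyy-mm-ddThh>
+//	field: <groupID>:<modelName>:<counter>   (counters: requests, success, prompt, completion)
+//
+// HIncrBy keeps each increment atomic; flushLoop later drains the hash.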
-func (s *AnalyticsService) handleAnalyticsEvent(event *models.RequestFinishedEvent) {
-	if event.RequestLog.GroupID == nil {
-		return
-	}
-	ctx := context.Background()
-	key := fmt.Sprintf("analytics:hourly:%s", time.Now().UTC().Format("2006-01-02T15"))
+// handleMessage processes a single message. Note the accounting: eventsReceived
+// counts successfully decoded messages, while decode failures only bump
+// eventsFailed.
+func (s *AnalyticsService) handleMessage(msg *store.Message) {
+	var event models.RequestFinishedEvent
+	if err := json.Unmarshal(msg.Payload, &event); err != nil {
+		s.logger.WithError(err).Error("Failed to unmarshal analytics event")
+		s.eventsFailed.Add(1)
+		return
+	}
+
+	s.eventsReceived.Add(1)
+
+	if err := s.handleAnalyticsEvent(&event); err != nil {
+		s.eventsFailed.Add(1)
+		s.logger.WithFields(logrus.Fields{
+			"correlation_id": event.CorrelationID,
+			"group_id":       event.RequestLog.GroupID,
+		}).WithError(err).Warn("Failed to process analytics event")
+	} else {
+		s.eventsProcessed.Add(1)
+	}
+}
+
+// handleAnalyticsEvent aggregates one event into the hourly hash.
+func (s *AnalyticsService) handleAnalyticsEvent(event *models.RequestFinishedEvent) error {
+	if event.RequestLog.GroupID == nil {
+		return nil // Skip events without a group ID.
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	now := time.Now().UTC()
+	key := fmt.Sprintf("analytics:hourly:%s", now.Format("2006-01-02T15"))
 	fieldPrefix := fmt.Sprintf("%d:%s", *event.RequestLog.GroupID, event.RequestLog.ModelName)

 	pipe := s.store.Pipeline(ctx)
 	pipe.HIncrBy(key, fieldPrefix+":requests", 1)

 	if event.RequestLog.IsSuccess {
 		pipe.HIncrBy(key, fieldPrefix+":success", 1)
 	}
@@ -101,80 +187,213 @@ func (s *AnalyticsService) handleAnalyticsEvent(event *models.RequestFinishedEvent
 	if event.RequestLog.CompletionTokens > 0 {
 		pipe.HIncrBy(key, fieldPrefix+":completion", int64(event.RequestLog.CompletionTokens))
 	}

+	// Set the expiry (retain for 48 hours) so unflushed data cannot pile up forever.
 	pipe.Expire(key, 48*time.Hour)

 	if err := pipe.Exec(); err != nil {
-		s.logger.Warnf("[%s] Failed to record analytics event to store for group %d: %v", event.CorrelationID, *event.RequestLog.GroupID, err)
+		return fmt.Errorf("redis pipeline failed: %w", err)
 	}

+	return nil
 }
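
+// flushLoop drives the periodic flush. It latches flushInterval once at
+// startup (under configMutex); later runtime-config changes do not retick an
+// already-running loop.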
 func (s *AnalyticsService) flushLoop() {
 	defer s.wg.Done()
-	ticker := time.NewTicker(flushLoopInterval)
+
+	s.configMutex.RLock()
+	interval := s.flushInterval
+	s.configMutex.RUnlock()
+
+	ticker := time.NewTicker(interval)
 	defer ticker.Stop()
+
+	s.logger.WithField("interval", interval).Info("Flush loop started")
+
 	for {
 		select {
 		case <-ticker.C:
 			s.flushToDB()
+
 		case <-s.stopChan:
+			s.logger.Info("Flush loop stopping")
 			return
+
+		case <-s.ctx.Done():
+			return
 		}
 	}
 }

+// flushToDB drains the hourly hashes into the database.
 func (s *AnalyticsService) flushToDB() {
-	ctx := context.Background()
+	start := time.Now()
+	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
+	defer cancel()

 	now := time.Now().UTC()
-	keysToFlush := []string{
-		fmt.Sprintf("analytics:hourly:%s", now.Add(-1*time.Hour).Format("2006-01-02T15")),
-		fmt.Sprintf("analytics:hourly:%s", now.Format("2006-01-02T15")),
-	}
+	keysToFlush := s.generateFlushKeys(now)
+
+	totalRecords := 0
+	totalErrors := 0

 	for _, key := range keysToFlush {
-		data, err := s.store.HGetAll(ctx, key)
-		if err != nil || len(data) == 0 {
-			continue
-		}
-		statsToFlush, parsedFields := s.parseStatsFromHash(now.Truncate(time.Hour), data)
-		if len(statsToFlush) > 0 {
-			upsertClause := s.dialect.OnConflictUpdateAll(
-				[]string{"time", "group_id", "model_name"},
-				[]string{"request_count", "success_count", "prompt_tokens", "completion_tokens"},
-			)
-			err := s.db.WithContext(ctx).Clauses(upsertClause).Create(&statsToFlush).Error
-			if err != nil {
-				s.logger.Errorf("Failed to flush analytics data for key %s: %v", key, err)
-			} else {
-				s.logger.Infof("Successfully flushed %d records from key %s.", len(statsToFlush), key)
-				_ = s.store.HDel(ctx, key, parsedFields...)
-			}
-		}
+		records, err := s.flushSingleKey(ctx, key, now)
+		if err != nil {
+			s.logger.WithError(err).WithField("key", key).Error("Failed to flush key")
+			totalErrors++
+			s.flushErrors.Add(1)
+		} else {
+			totalRecords += records
+		}
 	}

+	s.recordsFlushed.Add(uint64(totalRecords))
+	s.flushCount.Add(1)
+
+	s.lastFlushMutex.Lock()
+	s.lastFlushTime = time.Now()
+	s.lastFlushMutex.Unlock()
+
+	duration := time.Since(start)
+
+	if totalRecords > 0 || totalErrors > 0 {
+		s.logger.WithFields(logrus.Fields{
+			"records_flushed": totalRecords,
+			"keys_processed":  len(keysToFlush),
+			"errors":          totalErrors,
+			"duration":        duration,
+		}).Info("Analytics data flush completed")
+	} else {
+		s.logger.WithField("duration", duration).Debug("Analytics flush completed (no data)")
+	}
 }
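
+// generateFlushKeys returns the Redis keys that need draining: the current
+// hour plus the previous three. For now = 2026-01-02 15:04 UTC that is
+// (timestamps illustrative):
+//
+//	analytics:hourly:2026-01-02T15
+//	analytics:hourly:2026-01-02T14
+//	analytics:hourly:2026-01-02T13
+//	analytics:hourly:2026-01-02T12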
 func (s *AnalyticsService) generateFlushKeys(now time.Time) []string {
+	keys := make([]string, 0, 4)
+
+	// Current hour.
+	keys = append(keys, fmt.Sprintf("analytics:hourly:%s", now.Format("2006-01-02T15")))
+
+	// Previous 3 hours (covers flush delays and timezone issues).
+	for i := 1; i <= 3; i++ {
+		pastHour := now.Add(-time.Duration(i) * time.Hour)
+		keys = append(keys, fmt.Sprintf("analytics:hourly:%s", pastHour.Format("2006-01-02T15")))
+	}
+
+	return keys
+}
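
+// flushSingleKey is read -> upsert -> delete: hash fields are removed only
+// after the database upsert succeeds, so a failed upsert leaves the data in
+// Redis for a later flush pass to retry.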
+func (s *AnalyticsService) flushSingleKey(ctx context.Context, key string, baseTime time.Time) (int, error) {
+	data, err := s.store.HGetAll(ctx, key)
+	if err != nil {
+		return 0, fmt.Errorf("failed to get hash data: %w", err)
+	}
+
+	if len(data) == 0 {
+		return 0, nil // No data; skip.
+	}
+
+	// Parse the timestamp back out of the key name.
+	hourStr := strings.TrimPrefix(key, "analytics:hourly:")
+	recordTime, err := time.Parse("2006-01-02T15", hourStr)
+	if err != nil {
+		s.logger.WithError(err).WithField("key", key).Warn("Failed to parse time from key")
+		recordTime = baseTime.Truncate(time.Hour)
+	}
+
+	statsToFlush, parsedFields := s.parseStatsFromHash(recordTime, data)
+	if len(statsToFlush) == 0 {
+		return 0, nil
+	}
+
+	// Transactional upsert with bounded retries.
+	var dbErr error
+	for attempt := 1; attempt <= maxRetryAttempts; attempt++ {
+		dbErr = s.upsertStatsWithTransaction(ctx, statsToFlush)
+		if dbErr == nil {
+			break
+		}
+
+		if attempt < maxRetryAttempts {
+			s.logger.WithFields(logrus.Fields{
+				"attempt": attempt,
+				"key":     key,
+			}).WithError(dbErr).Warn("Database upsert failed, retrying...")
+			time.Sleep(retryDelay)
+		}
+	}
+
+	if dbErr != nil {
+		return 0, fmt.Errorf("failed to upsert after %d attempts: %w", maxRetryAttempts, dbErr)
+	}
+
+	// Delete only the fields that were successfully parsed and persisted.
+	if len(parsedFields) > 0 {
+		if err := s.store.HDel(ctx, key, parsedFields...); err != nil {
+			s.logger.WithError(err).WithField("key", key).Warn("Failed to delete flushed fields from Redis")
+		}
+	}
+
+	return len(statsToFlush), nil
+}
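
+// upsertStatsWithTransaction batch-upserts the rows in one transaction. The
+// conflict target (time, group_id, model_name) assumes a matching unique index
+// on the hourly-stats table; on conflict the four counter columns are updated.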
+func (s *AnalyticsService) upsertStatsWithTransaction(ctx context.Context, stats []models.StatsHourly) error {
+	return s.db.WithContext(ctx).Transaction(func(tx *gorm.DB) error {
+		upsertClause := s.dialect.OnConflictUpdateAll(
+			[]string{"time", "group_id", "model_name"},
+			[]string{"request_count", "success_count", "prompt_tokens", "completion_tokens"},
+		)
+		return tx.Clauses(upsertClause).Create(&stats).Error
+	})
+}
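
+// parseStatsFromHash turns the hourly hash into StatsHourly rows. Example
+// (illustrative IDs): field "7:gemini-pro:requests" with value "42" sets
+// RequestCount=42 on the row for group 7 / model "gemini-pro". It returns the
+// rows plus the list of fields that were successfully parsed.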
 func (s *AnalyticsService) parseStatsFromHash(t time.Time, data map[string]string) ([]models.StatsHourly, []string) {
 	tempAggregator := make(map[string]*models.StatsHourly)
-	var parsedFields []string
+	parsedFields := make([]string, 0, len(data))

 	for field, valueStr := range data {
 		parts := strings.Split(field, ":")
 		if len(parts) != 3 {
+			s.logger.WithField("field", field).Warn("Invalid field format")
 			continue
 		}
+
 		groupIDStr, modelName, counterType := parts[0], parts[1], parts[2]
 		aggKey := groupIDStr + ":" + modelName

 		if _, ok := tempAggregator[aggKey]; !ok {
 			gid, err := strconv.Atoi(groupIDStr)
 			if err != nil {
+				s.logger.WithFields(logrus.Fields{
+					"field":    field,
+					"group_id": groupIDStr,
+				}).Warn("Invalid group ID")
 				continue
 			}

 			tempAggregator[aggKey] = &models.StatsHourly{
 				Time:      t,
 				GroupID:   uint(gid),
 				ModelName: modelName,
 			}
 		}
-		val, _ := strconv.ParseInt(valueStr, 10, 64)
+
+		val, err := strconv.ParseInt(valueStr, 10, 64)
+		if err != nil {
+			s.logger.WithFields(logrus.Fields{
+				"field": field,
+				"value": valueStr,
+			}).Warn("Invalid counter value")
+			continue
+		}

 		switch counterType {
 		case "requests":
 			tempAggregator[aggKey].RequestCount = val
@@ -184,14 +403,92 @@ func (s *AnalyticsService) parseStatsFromHash(t time.Time, data map[string]string
 			tempAggregator[aggKey].PromptTokens = val
 		case "completion":
 			tempAggregator[aggKey].CompletionTokens = val
+		default:
+			s.logger.WithField("counter_type", counterType).Warn("Unknown counter type")
+			continue
 		}

 		parsedFields = append(parsedFields, field)
 	}
-	var result []models.StatsHourly
+
+	result := make([]models.StatsHourly, 0, len(tempAggregator))
 	for _, stats := range tempAggregator {
+		if stats.RequestCount > 0 {
 			result = append(result, *stats)
+		}
 	}

 	return result, parsedFields
 }

+// metricsReporter periodically logs the service counters.
+func (s *AnalyticsService) metricsReporter() {
+	defer s.wg.Done()
+
+	ticker := time.NewTicker(5 * time.Minute)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ticker.C:
+			s.reportMetrics()
+		case <-s.stopChan:
+			return
+		case <-s.ctx.Done():
+			return
+		}
+	}
+}
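
+// reportMetrics logs one snapshot; success_rate is processed/received and is
+// left at 0 when nothing has been received yet, avoiding a divide-by-zero.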
+func (s *AnalyticsService) reportMetrics() {
+	s.lastFlushMutex.RLock()
+	lastFlush := s.lastFlushTime
+	s.lastFlushMutex.RUnlock()
+
+	received := s.eventsReceived.Load()
+	processed := s.eventsProcessed.Load()
+	failed := s.eventsFailed.Load()
+
+	var successRate float64
+	if received > 0 {
+		successRate = float64(processed) / float64(received) * 100
+	}
+
+	s.logger.WithFields(logrus.Fields{
+		"events_received":  received,
+		"events_processed": processed,
+		"events_failed":    failed,
+		"success_rate":     fmt.Sprintf("%.2f%%", successRate),
+		"flush_count":      s.flushCount.Load(),
+		"records_flushed":  s.recordsFlushed.Load(),
+		"flush_errors":     s.flushErrors.Load(),
+		"last_flush_ago":   time.Since(lastFlush).Round(time.Second),
+	}).Info("Analytics metrics")
+}
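
+// GetMetrics returns a snapshot of the counters for monitoring. A caller
+// might, for example, serve it from an admin endpoint via
+// json.NewEncoder(w).Encode(svc.GetMetrics()); that handler wiring is
+// hypothetical, not part of this package.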
+func (s *AnalyticsService) GetMetrics() map[string]interface{} {
+	s.lastFlushMutex.RLock()
+	lastFlush := s.lastFlushTime
+	s.lastFlushMutex.RUnlock()
+
+	// Read flushInterval under configMutex to avoid a data race.
+	s.configMutex.RLock()
+	interval := s.flushInterval
+	s.configMutex.RUnlock()
+
+	received := s.eventsReceived.Load()
+	processed := s.eventsProcessed.Load()
+
+	var successRate float64
+	if received > 0 {
+		successRate = float64(processed) / float64(received) * 100
+	}
+
+	return map[string]interface{}{
+		"events_received":  received,
+		"events_processed": processed,
+		"events_failed":    s.eventsFailed.Load(),
+		"success_rate":     successRate,
+		"flush_count":      s.flushCount.Load(),
+		"records_flushed":  s.recordsFlushed.Load(),
+		"flush_errors":     s.flushErrors.Load(),
+		"last_flush_ago":   time.Since(lastFlush).Seconds(),
+		"flush_interval":   interval.Seconds(),
+	}
+}