Add per-chunk download timing to telemetry

samikshya-db · samikshya-db · commit 33fcf7409380 · 2026-04-13T22:50:48.000+05:30
Time each FetchResults call in fetchResultPage() and propagate the
latency through the telemetryUpdate callback. Aggregate initial,
slowest, and sum chunk fetch latencies in the QueryContext closure,
then map them to the ChunkDetails wire fields:
  - initial_chunk_latency_millis
  - slowest_chunk_latency_millis
  - sum_chunks_download_time_millis

Also populate total_chunks_present from server-reported data:
  - DirectResults with CloseOperation set → chunks counted so far (expected to be 1; all data inline)
  - CloudFetch ResultLinks → len(links) from the first response that carries links

The existing total_chunks_iterated (chunk_count tag) is unchanged.

Co-authored-by: Isaac
diff --git a/connection.go b/connection.go
@@ -225,11 +225,42 @@ func (c *conn) QueryContext(ctx context.Context, query string, args []driver.Nam
 		return nil, dbsqlerrint.NewExecutionError(ctx, dbsqlerr.ErrQueryExecution, err, opStatusResp)
 	}
 
+	// Per-chunk timing state captured in the closure below.
+	var (
+		chunkTimingInitialMs  int64
+		chunkTimingSlowestMs  int64
+		chunkTimingSumMs      int64
+		chunkTimingInitialSet bool
+		chunkTotalPresent     int32
+	)
+
 	// Telemetry callback for tracking row fetching metrics
-	telemetryUpdate := func(chunkCount int, bytesDownloaded int64) {
-		if c.telemetry != nil {
-			c.telemetry.AddTag(ctx, "chunk_count", chunkCount)
-			c.telemetry.AddTag(ctx, "bytes_downloaded", bytesDownloaded)
+	telemetryUpdate := func(chunkCount int, bytesDownloaded int64, chunkIndex int, chunkLatencyMs int64, totalChunksPresent int32) {
+		if c.telemetry == nil {
+			return
+		}
+		c.telemetry.AddTag(ctx, "chunk_count", chunkCount)
+		c.telemetry.AddTag(ctx, "bytes_downloaded", bytesDownloaded)
+
+		// Aggregate per-chunk fetch latencies; a latency of 0 (direct results, or a fetch that truncates to 0 ms) is skipped.
+		if chunkLatencyMs > 0 {
+			if !chunkTimingInitialSet {
+				chunkTimingInitialMs = chunkLatencyMs
+				chunkTimingInitialSet = true
+			}
+			if chunkLatencyMs > chunkTimingSlowestMs {
+				chunkTimingSlowestMs = chunkLatencyMs
+			}
+			chunkTimingSumMs += chunkLatencyMs
+			c.telemetry.AddTag(ctx, "chunk_initial_latency_ms", chunkTimingInitialMs)
+			c.telemetry.AddTag(ctx, "chunk_slowest_latency_ms", chunkTimingSlowestMs)
+			c.telemetry.AddTag(ctx, "chunk_sum_latency_ms", chunkTimingSumMs)
+		}
+
+		// Record total chunks present from the first non-zero server report; later reports are ignored.
+		if totalChunksPresent > 0 && chunkTotalPresent == 0 {
+			chunkTotalPresent = totalChunksPresent
+			c.telemetry.AddTag(ctx, "chunk_total_present", int(chunkTotalPresent))
 		}
 	}
 
@@ -673,8 +704,8 @@ func (c *conn) execStagingOperation(
 	}
 
 	if len(driverctx.StagingPathsFromContext(ctx)) != 0 {
-		// Telemetry callback for staging operation row fetching
-		telemetryUpdate := func(chunkCount int, bytesDownloaded int64) {
+		// Telemetry callback for staging operation row fetching (chunk timing not tracked for staging ops).
+		telemetryUpdate := func(chunkCount int, bytesDownloaded int64, chunkIndex int, chunkLatencyMs int64, totalChunksPresent int32) {
 			if c.telemetry != nil {
 				c.telemetry.AddTag(ctx, "chunk_count", chunkCount)
 				c.telemetry.AddTag(ctx, "bytes_downloaded", bytesDownloaded)
diff --git a/internal/rows/rows.go b/internal/rows/rows.go
@@ -59,7 +59,13 @@ type rows struct {
 	ctx context.Context
 
 	// Telemetry tracking
-	telemetryUpdate func(chunkCount int, bytesDownloaded int64)
+	// telemetryUpdate is called after each chunk is fetched with:
+	//   chunkCount: total chunks fetched so far (including direct results)
+	//   bytesDownloaded: cumulative bytes
+	//   chunkIndex: 0-based index of the chunk just fetched
+	//   chunkLatencyMs: fetch latency for this chunk (0 for direct results)
+	//   totalChunksPresent: server-reported total, 0 if unknown
+	telemetryUpdate func(chunkCount int, bytesDownloaded int64, chunkIndex int, chunkLatencyMs int64, totalChunksPresent int32)
 	closeCallback   func(latencyMs int64, err error)
 	chunkCount      int
 	bytesDownloaded int64
@@ -78,7 +84,7 @@ func NewRows(
 	client cli_service.TCLIService,
 	config *config.Config,
 	directResults *cli_service.TSparkDirectResults,
-	telemetryUpdate func(chunkCount int, bytesDownloaded int64),
+	telemetryUpdate func(chunkCount int, bytesDownloaded int64, chunkIndex int, chunkLatencyMs int64, totalChunksPresent int32),
 	closeCallback func(latencyMs int64, err error),
 ) (driver.Rows, dbsqlerr.DBError) {
 
@@ -148,7 +154,18 @@ func NewRows(
 		}
 
 		if r.telemetryUpdate != nil {
-			r.telemetryUpdate(r.chunkCount, r.bytesDownloaded)
+			// Determine totalChunksPresent for direct results.
+			// If the server already closed the operation, all data is here, so report the chunks counted so far (r.chunkCount).
+			// For CloudFetch direct results, use the number of result links.
+			var totalPresent int32
+			if directResults.CloseOperation != nil {
+				totalPresent = int32(r.chunkCount)
+			} else if directResults.ResultSet != nil && directResults.ResultSet.Results != nil &&
+				directResults.ResultSet.Results.ResultLinks != nil {
+				totalPresent = int32(len(directResults.ResultSet.Results.ResultLinks)) //nolint:gosec
+			}
+			// chunkIndex=0, chunkLatencyMs=0: direct results have no separate fetch latency.
+			r.telemetryUpdate(r.chunkCount, r.bytesDownloaded, 0, 0, totalPresent)
 		}
 	}
 
@@ -480,7 +497,11 @@ func (r *rows) fetchResultPage() error {
 		r.RowScanner = nil
 	}
 
+	// Record 0-based chunk index before fetching (direct results occupied index 0 if present).
+	chunkIndex := r.chunkCount
+	fetchStart := time.Now()
 	fetchResult, err1 := r.ResultPageIterator.Next()
+	chunkLatencyMs := time.Since(fetchStart).Milliseconds()
 	if err1 != nil {
 		return err1
 	}
@@ -494,8 +515,14 @@ func (r *rows) fetchResultPage() error {
 		}
 	}
 
+	// For CloudFetch, the number of result links in this response is used as the server-reported total.
+	var totalPresent int32
+	if fetchResult != nil && fetchResult.Results != nil && fetchResult.Results.ResultLinks != nil {
+		totalPresent = int32(len(fetchResult.Results.ResultLinks)) //nolint:gosec
+	}
+
 	if r.telemetryUpdate != nil {
-		r.telemetryUpdate(r.chunkCount, r.bytesDownloaded)
+		r.telemetryUpdate(r.chunkCount, r.bytesDownloaded, chunkIndex, chunkLatencyMs, totalPresent)
 	}
 
 	err1 = r.makeRowScanner(fetchResult)
diff --git a/telemetry/request.go b/telemetry/request.go
@@ -184,6 +184,18 @@ func createTelemetryRequest(metrics []*telemetryMetric, driverVersion string) (*
 				sqlOp.ChunkDetails = &ChunkDetails{
 					TotalChunksIterated: int32(chunkCount), //nolint:gosec // chunk count is always small
 				}
+				if v, ok := tags["chunk_initial_latency_ms"].(int64); ok && v > 0 {
+					sqlOp.ChunkDetails.InitialChunkLatencyMs = v
+				}
+				if v, ok := tags["chunk_slowest_latency_ms"].(int64); ok && v > 0 {
+					sqlOp.ChunkDetails.SlowestChunkLatencyMs = v
+				}
+				if v, ok := tags["chunk_sum_latency_ms"].(int64); ok && v > 0 {
+					sqlOp.ChunkDetails.SumChunksDownloadTimeMs = v
+				}
+				if v, ok := tags["chunk_total_present"].(int); ok && v > 0 {
+					sqlOp.ChunkDetails.TotalChunksPresent = int32(v) //nolint:gosec // chunk count is always small
+				}
 			}
 
 			if opType, ok := tags["operation_type"].(string); ok {
diff --git a/telemetry/tags.go b/telemetry/tags.go
@@ -21,6 +21,14 @@ const (
 	TagPollLatency           = "poll.latency_ms"
 )
 
+// Tag names for chunk timing and chunk count metrics
+const (
+	TagChunkInitialLatencyMs = "chunk_initial_latency_ms"
+	TagChunkSlowestLatencyMs = "chunk_slowest_latency_ms"
+	TagChunkSumLatencyMs     = "chunk_sum_latency_ms"
+	TagChunkTotalPresent     = "chunk_total_present"
+)
+
 // Tag names for error metrics
 const (
 	TagErrorType = "error.type"
@@ -77,6 +85,10 @@ func statementTags() []tagDefinition {
 		{TagCompressionEnabled, exportDatabricks, "Compression enabled", false},
 		{TagPollCount, exportDatabricks, "Poll count", false},
 		{TagPollLatency, exportDatabricks, "Poll latency", false},
+		{TagChunkInitialLatencyMs, exportDatabricks, "Initial chunk fetch latency ms", false},
+		{TagChunkSlowestLatencyMs, exportDatabricks, "Slowest chunk fetch latency ms", false},
+		{TagChunkSumLatencyMs, exportDatabricks, "Sum of chunk fetch latencies ms", false},
+		{TagChunkTotalPresent, exportDatabricks, "Total chunks reported by server", false},
 	}
 }