Skip to content

Commit 242fad2

Browse files
committed
Delay EXECUTE_STATEMENT metric emission to rows.Close()
Previously AfterExecute/CompleteStatement fired when QueryContext() returned — before the user ever called rows.Next(). This meant chunk_count was always 1 and all per-chunk timing fields were null, because fetchResultPage() calls happen during row iteration.

Fix:
- Add FinalizeLatency() to Interceptor: captures elapsed time at QueryContext return to preserve execute-only latency in mc.
- AfterExecute() uses the pre-captured latency if available, so the metric reports server-exec+poll time regardless of when it fires.
- Move AfterExecute/CompleteStatement from a defer in QueryContext to closeCallback, which rows.Close() invokes after all rows are read. At that point chunk_count and all timing tags are fully accumulated.
- Error path: still emits EXECUTE_STATEMENT immediately on runQuery failure (no rows means no chunks to wait for).

Co-authored-by: Isaac
1 parent 33fcf74 commit 242fad2

2 files changed

Lines changed: 64 additions & 12 deletions

File tree

connection.go

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -207,25 +207,34 @@ func (c *conn) QueryContext(ctx context.Context, query string, args []driver.Nam
207207
log, ctx = client.LoggerAndContext(ctx, exStmtResp)
208208
defer log.Duration(msg, start)
209209

210-
// Telemetry: track statement execution
210+
// Telemetry: set up metric context for the statement.
211+
// BeforeExecuteWithTime anchors startTime to before runQuery() ran.
211212
var statementID string
212213
if c.telemetry != nil && exStmtResp != nil && exStmtResp.OperationHandle != nil && exStmtResp.OperationHandle.OperationId != nil {
213214
statementID = client.SprintGuid(exStmtResp.OperationHandle.OperationId.GUID)
214-
// Use BeforeExecuteWithTime to set the correct start time (before execution)
215215
ctx = c.telemetry.BeforeExecuteWithTime(ctx, c.id, statementID, executeStart)
216216
c.telemetry.AddTag(ctx, "operation_type", telemetry.OperationTypeExecuteStatement)
217-
defer func() {
218-
c.telemetry.AfterExecute(ctx, err)
219-
c.telemetry.CompleteStatement(ctx, statementID, err != nil)
220-
}()
221217
}
222218

223219
if err != nil {
220+
// Error path: finalize and emit the EXECUTE_STATEMENT metric immediately —
221+
// there are no rows to iterate so the metric is complete right now.
222+
if c.telemetry != nil && statementID != "" {
223+
c.telemetry.AfterExecute(ctx, err)
224+
c.telemetry.CompleteStatement(ctx, statementID, true)
225+
}
224226
log.Err(err).Msg("databricks: failed to run query") // To log query we need to redact credentials
225227
return nil, dbsqlerrint.NewExecutionError(ctx, dbsqlerr.ErrQueryExecution, err, opStatusResp)
226228
}
227229

228-
// Per-chunk timing state captured in the closure below.
230+
// Success path: freeze execute latency NOW (before row iteration inflates time.Since).
231+
// AfterExecute/CompleteStatement are called from closeCallback after all chunks
232+
// are fetched, so the final metric carries complete chunk timing data.
233+
if c.telemetry != nil && statementID != "" {
234+
c.telemetry.FinalizeLatency(ctx)
235+
}
236+
237+
// Per-chunk timing state accumulated across all fetchResultPage calls.
229238
var (
230239
chunkTimingInitialMs int64
231240
chunkTimingSlowestMs int64
@@ -234,7 +243,7 @@ func (c *conn) QueryContext(ctx context.Context, query string, args []driver.Nam
234243
chunkTotalPresent int32
235244
)
236245

237-
// Telemetry callback for tracking row fetching metrics
246+
// Telemetry callback invoked after each result page is fetched.
238247
telemetryUpdate := func(chunkCount int, bytesDownloaded int64, chunkIndex int, chunkLatencyMs int64, totalChunksPresent int32) {
239248
if c.telemetry == nil {
240249
return
@@ -257,16 +266,29 @@ func (c *conn) QueryContext(ctx context.Context, query string, args []driver.Nam
257266
c.telemetry.AddTag(ctx, "chunk_sum_latency_ms", chunkTimingSumMs)
258267
}
259268

260-
// Record total chunks present from first server report.
269+
// Record server-reported total chunks from first non-zero report.
261270
if totalChunksPresent > 0 && chunkTotalPresent == 0 {
262271
chunkTotalPresent = totalChunksPresent
263272
c.telemetry.AddTag(ctx, "chunk_total_present", int(chunkTotalPresent))
264273
}
265274
}
266275

267-
// Telemetry callback for CLOSE_STATEMENT — fired from rows.Close()
276+
// closeCallback is invoked from rows.Close() after all rows have been consumed.
277+
// At that point chunk timing is fully accumulated in ctx tags, so we finalize
278+
// EXECUTE_STATEMENT here rather than at QueryContext return time.
268279
var closeCallback func(latencyMs int64, err error)
269-
if c.telemetry != nil {
280+
if c.telemetry != nil && statementID != "" {
281+
interceptor := c.telemetry
282+
connID := c.id
283+
stmtID := statementID
284+
closeCallback = func(latencyMs int64, closeErr error) {
285+
// Emit EXECUTE_STATEMENT with complete chunk data now that iteration is done.
286+
interceptor.AfterExecute(ctx, nil)
287+
interceptor.CompleteStatement(ctx, stmtID, false)
288+
// Emit CLOSE_STATEMENT as a separate operation event.
289+
interceptor.RecordOperation(ctx, connID, telemetry.OperationTypeCloseStatement, latencyMs, closeErr)
290+
}
291+
} else if c.telemetry != nil {
270292
interceptor := c.telemetry
271293
connID := c.id
272294
closeCallback = func(latencyMs int64, closeErr error) {

telemetry/interceptor.go

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@ type metricContext struct {
2020
statementID string
2121
startTime time.Time
2222
tags map[string]interface{}
23+
24+
// capturedLatencyMs is set by FinalizeLatency() to freeze the execute-phase
25+
// latency before row iteration begins. AfterExecute uses this value instead
26+
// of re-measuring from startTime (which would include row-scan time).
27+
capturedLatencyMs int64
28+
latencyCaptured bool
2329
}
2430

2531
type contextKey int
@@ -83,6 +89,22 @@ func (i *Interceptor) BeforeExecuteWithTime(ctx context.Context, sessionID strin
8389
return withMetricContext(ctx, mc)
8490
}
8591

92+
// FinalizeLatency freezes the elapsed time as the statement's execution latency.
93+
// Call this when the execute phase is complete (i.e. when QueryContext returns) so
94+
// that AfterExecute, even if called later from rows.Close(), still reports
95+
// execute-only latency rather than total latency that would include row iteration.
96+
// Exported for use by the driver package.
97+
func (i *Interceptor) FinalizeLatency(ctx context.Context) {
98+
if !i.enabled {
99+
return
100+
}
101+
mc := getMetricContext(ctx)
102+
if mc != nil && !mc.latencyCaptured {
103+
mc.capturedLatencyMs = time.Since(mc.startTime).Milliseconds()
104+
mc.latencyCaptured = true
105+
}
106+
}
107+
86108
// AfterExecute is called after statement execution.
87109
// Records the metric with timing and error information.
88110
// Exported for use by the driver package.
@@ -103,12 +125,20 @@ func (i *Interceptor) AfterExecute(ctx context.Context, err error) {
103125
}
104126
}()
105127

128+
// Use pre-captured latency if available (set by FinalizeLatency), otherwise
129+
// fall back to measuring from startTime (covers the error-path where
130+
// FinalizeLatency was never called).
131+
latencyMs := time.Since(mc.startTime).Milliseconds()
132+
if mc.latencyCaptured {
133+
latencyMs = mc.capturedLatencyMs
134+
}
135+
106136
metric := &telemetryMetric{
107137
metricType: "statement",
108138
timestamp: mc.startTime,
109139
sessionID: mc.sessionID,
110140
statementID: mc.statementID,
111-
latencyMs: time.Since(mc.startTime).Milliseconds(),
141+
latencyMs: latencyMs,
112142
tags: mc.tags,
113143
}
114144

0 commit comments

Comments (0)