nimbus-eth1-benchmarks/regenerate_readme.sh at master · status-im/nimbus-eth1-benchmarks · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
#!/usr/bin/env bash
set -e

# Filesystem layout: every path is derived from the directory that holds this
# script, which doubles as the root of the benchmarks repository.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
NIMBUS_ETH1_BENCHMARKS_REPO="$SCRIPT_DIR"
README_FILE_PATH="$NIMBUS_ETH1_BENCHMARKS_REPO/README.md"
README_TEMPLATE_PATH="$NIMBUS_ETH1_BENCHMARKS_REPO/README-TEMPLATE.md"
VENV_DIR="$NIMBUS_ETH1_BENCHMARKS_REPO/.venv"

# Commit links in the generated tables are built as <URL>/<sha>.
NIMBUS_ETH1_REPO_URL="https://github.com/status-im/nimbus-eth1/commit"

# CSV files read and written by this script.
SHORT_HISTORY_CSV="$NIMBUS_ETH1_BENCHMARKS_REPO/short-benchmark-history.csv"
LONG_HISTORY_CSV="$NIMBUS_ETH1_BENCHMARKS_REPO/long-benchmark-history.csv"
REGRESSIONS_CSV="$NIMBUS_ETH1_BENCHMARKS_REPO/regressions.csv"
IMPROVEMENTS_CSV="$NIMBUS_ETH1_BENCHMARKS_REPO/improvements.csv"

# The only recognised command-line argument is "--sync" as the first argument.
case "${1:-}" in
  --sync)
    SYNC_MODE=true
    echo "Sync mode: will purge stale entries"
    ;;
  *)
    SYNC_MODE=false
    ;;
esac

# Column headers shared by every CSV file and every markdown table.
CSV_HEADER="Generated At,Baseline SHA,Contender SHA,Baseline Time,Contender Time,Time Delta"
MD_TABLE_HEADER="| Generated At | Baseline SHA | Contender SHA | Baseline Time | Contender Time | Time Delta |"
MD_TABLE_HEADER+=$'\n'"|--------------|--------------|---------------|---------------|----------------|------------|"

# Convert a compact timestamp such as "20260103T201817" into
# "2026-01-03 20:18:17".
# Arguments: $1 - compact timestamp (YYYYMMDD, one separator char, HHMMSS)
# Outputs:   the reformatted timestamp on stdout
format_timestamp_date() {
  # Fixed-width slicing; position 9 (the "T" separator) is skipped.
  awk '{
      printf "%s-%s-%s %s:%s:%s\n",
        substr($0, 1, 4), substr($0, 5, 2), substr($0, 7, 2),
        substr($0, 10, 2), substr($0, 12, 2), substr($0, 14, 2)
  }' <<<"$1"
}

# Render a commit SHA as a markdown link into the nimbus-eth1 repository.
# Globals:   NIMBUS_ETH1_REPO_URL (read)
# Arguments: $1 - commit SHA
# Outputs:   "[<sha>](<repo-url>/<sha>)" on stdout
sha_to_link() {
  local commit="$1"
  printf '[%s](%s/%s)\n' "$commit" "$NIMBUS_ETH1_REPO_URL" "$commit"
}

# Turn one history CSV record into a markdown table row.
# Arguments: $1 - CSV line: timestamp,baseline sha,contender sha,
#                 baseline time,contender time,"time delta"
# Outputs:   the markdown row on stdout, with both SHAs rendered as links
csv_to_md_row() {
  local record="$1"
  local -a cols=()
  local idx delta

  # The first five columns are plain comma-separated values.
  for idx in 1 2 3 4 5; do
    cols+=("$(cut -d',' -f"$idx" <<<"$record")")
  done

  # The delta column may itself contain commas, so it is recovered from its
  # surrounding double quotes rather than by splitting on ','.
  delta=$(grep -o '"[^"]*"' <<<"$record" | tr -d '"')

  printf '| %s | %s | %s | %s | %s | %s |\n' \
    "${cols[0]}" \
    "$(sha_to_link "${cols[1]}")" \
    "$(sha_to_link "${cols[2]}")" \
    "${cols[3]}" \
    "${cols[4]}" \
    "$delta"
}

# List the benchmark directory names already recorded in a history CSV.
# Each data row is mapped back to "<YYYYMMDDTHHMMSS>_<contender sha>".
# Arguments: $1 - path to the history CSV (a missing file yields no output)
# Outputs:   one reconstructed directory name per line on stdout
get_processed_dirs() {
  local csv_file="$1"
  local stamp sha compact _skip _tail

  [[ -f "$csv_file" ]] || return 0

  # Skip the header row; column 1 is the timestamp, column 3 the contender SHA.
  while IFS=',' read -r stamp _skip sha _tail; do
    # "2026-01-03 20:18:17" -> "20260103T201817"
    compact=${stamp//-/}
    compact=${compact/ /T}
    compact=${compact//:/}
    printf '%s_%s\n' "$compact" "$sha"
  done < <(tail -n +2 "$csv_file")
}

# Purge CSV entries for benchmarks that no longer exist on disk.
#
# Every data row is mapped back to the directory name it was generated from
# ("<YYYYMMDDTHHMMSS>_<contender sha>"); rows whose directory is missing under
# "<repo>/<type>-benchmark/" are dropped. The header row is always kept.
#
# Globals:   NIMBUS_ETH1_BENCHMARKS_REPO (read)
# Arguments: $1 - benchmark type, "short" or "long"
#            $2 - path to the history CSV (a missing file is a no-op)
# Outputs:   one "Purged: ..." line per dropped row plus a summary, on stdout
# Returns:   0 on success
purge_stale_entries() {
  local benchmark_type="$1"  # "short" or "long"
  local csv_file="$2"
  local benchmark_dir="${NIMBUS_ETH1_BENCHMARKS_REPO}/${benchmark_type}-benchmark"
  local tmp_file line timestamp contender ts_compact dir_name _baseline _rest
  local purged_count=0

  [[ -f "$csv_file" ]] || return 0

  tmp_file=$(mktemp)

  # Keep header
  head -n 1 "$csv_file" > "$tmp_file"

  # Check each entry. The "|| [[ -n ... ]]" clause makes sure a final row that
  # lacks a trailing newline is still examined instead of silently vanishing.
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Column 1 is the timestamp, column 3 the contender SHA.
    IFS=',' read -r timestamp _baseline contender _rest <<<"$line"

    # "2026-01-03 20:18:17" -> "20260103T201817"
    ts_compact=${timestamp//-/}
    ts_compact=${ts_compact/ /T}
    ts_compact=${ts_compact//:/}
    dir_name="${ts_compact}_${contender}"

    if [[ -d "${benchmark_dir}/${dir_name}" ]]; then
      printf '%s\n' "$line" >> "$tmp_file"
    else
      echo "  Purged: $dir_name (directory not found)"
      purged_count=$((purged_count + 1))
    fi
  done < <(tail -n +2 "$csv_file")

  # Write the result back in place rather than moving the temp file over the
  # CSV: mktemp creates files with mode 0600, and a move would replace the
  # CSV's own permissions with those.
  cat "$tmp_file" > "$csv_file"
  rm -f "$tmp_file"
  echo "Purged $purged_count stale ${benchmark_type} benchmark(s)"
}

# Process a single benchmark log file and emit its history CSV row.
#
# The log is expected to live in "<type>-benchmark/<YYYYMMDDTHHMMSS>_<sha>/".
# The timestamp and contender SHA come from that directory name. The baseline
# SHA is the SHA of the previous (next-older) sibling directory; if there is
# none, it is extracted from the "block-import-stats.py" line of the log.
#
# Arguments: $1 - path to the benchmark log file
# Outputs:   on success, one CSV row on stdout:
#              timestamp,baseline sha,contender sha,baseline time,
#              contender time,"time delta"
#            on failure, a warning on stderr
# Returns:   0 if a row was produced, 1 if any field could not be determined
process_benchmark_file() {
  local file="$1"
  local dir_path dir_name raw_timestamp timestamp contender_git_sha
  local benchmark_type_dir baseline_git_sha prev_dir d d_name
  local baseline_time contender_time time_delta
  local found_current=false

  dir_path=$(dirname "$file")
  dir_name=$(basename "$dir_path")

  # A directory name without the timestamp prefix cannot produce a usable row:
  # the row would get the timestamp "-- ::", would never be recognised as
  # already processed, and so would be appended again on every run.
  raw_timestamp=$(grep -o '^[0-9]\{8\}T[0-9]\{6\}' <<<"$dir_name") || true
  if [[ -z "$raw_timestamp" ]]; then
    echo "Warning: Could not extract data from $file" >&2
    return 1
  fi

  timestamp=$(format_timestamp_date "$raw_timestamp")
  contender_git_sha=$(cut -d'_' -f2 <<<"$dir_name")
  benchmark_type_dir=$(dirname "$dir_path")
  baseline_git_sha=""
  prev_dir=""

  # Find previous directory. The glob expands in ascending name order, which
  # for timestamp-prefixed names is oldest first, so the previous run is the
  # last real directory seen before reaching the current one.
  for d in "$benchmark_type_dir"/*/; do
    [[ -L "${d%/}" ]] && continue
    d_name=$(basename "$d")
    if [[ "$d_name" == "$dir_name" ]]; then
      found_current=true
      break
    fi
    prev_dir="$d_name"
  done

  # If the current directory was never reached (e.g. it is itself a symlink),
  # whatever was collected is not a meaningful predecessor.
  [[ "$found_current" == "true" ]] || prev_dir=""

  [[ -n "$prev_dir" ]] && baseline_git_sha=$(echo "$prev_dir" | cut -d'_' -f2)

  # Fallback: extract from file content
  if [[ -z "$baseline_git_sha" ]]; then
    baseline_git_sha=$(grep "block-import-stats.py" "$file" 2>/dev/null | grep -o '[^/]*_[^/]*' | cut -d'_' -f2 | head -n 1) || true
  fi

  baseline_time=$(grep -o "baseline: [0-9hms]*" "$file" 2>/dev/null | cut -d' ' -f2) || true
  contender_time=$(grep -o "contender: [0-9hms]*" "$file" 2>/dev/null | cut -d' ' -f2) || true
  time_delta=$(grep "Time (total):" "$file" 2>/dev/null | sed 's/Time (total): \(.*\)/\1/' | tr -d '"') || true
  baseline_time=$(echo "$baseline_time" | tr -d '"')
  contender_time=$(echo "$contender_time" | tr -d '"')

  if [[ -n "$baseline_git_sha" && -n "$contender_git_sha" && -n "$time_delta" && -n "$baseline_time" && -n "$contender_time" ]]; then
    # The delta is quoted because it may contain commas.
    echo "$timestamp,$baseline_git_sha,$contender_git_sha,$baseline_time,$contender_time,\"$time_delta\""
  else
    echo "Warning: Could not extract data from $file" >&2
    return 1
  fi
}

# Append rows for benchmark runs that are not yet recorded in the CSV.
#
# Scans "<repo>/<type>-benchmark/*/build-environment.log", skips symlinked run
# directories and runs already present in the CSV, and appends one row for
# every remaining log that process_benchmark_file can parse.
#
# Globals:   NIMBUS_ETH1_BENCHMARKS_REPO, CSV_HEADER (read)
# Arguments: $1 - benchmark type, "short" or "long"
#            $2 - path to the history CSV (created with a header if missing)
# Outputs:   one "New: ..." line per added run plus a summary, on stdout
process_new_benchmarks() {
  local benchmark_type="$1"  # "short" or "long"
  local csv_file="$2"
  local bench_root="${NIMBUS_ETH1_BENCHMARKS_REPO}/${benchmark_type}-benchmark"
  local run_dir run_name row
  local pending=""
  local added=0
  local -A seen

  # Start a fresh CSV when none exists yet.
  [[ -f "$csv_file" ]] || echo "$CSV_HEADER" > "$csv_file"

  # Remember which run directories already have a row.
  while IFS= read -r dir; do
    if [[ -n "$dir" ]]; then
      seen["$dir"]=1
    fi
  done < <(get_processed_dirs "$csv_file")

  for log_file in "$bench_root"/*/build-environment.log; do
    # Guards against the unexpanded glob when nothing matches.
    [[ -f "$log_file" ]] || continue

    run_dir=$(dirname "$log_file")
    run_name=$(basename "$run_dir")

    # Symlinked run directories are ignored.
    [[ -L "$run_dir" ]] && continue

    # So are runs that are already recorded.
    [[ -n "${seen[$run_name]:-}" ]] && continue

    # Logs that cannot be parsed are skipped.
    if row=$(process_benchmark_file "$log_file"); then
      pending+="${row}"$'\n'
      added=$((added + 1))
      echo "  New: $run_name"
    fi
  done

  # Every row in $pending already ends with a newline, hence echo -n.
  if [[ -n "$pending" ]]; then
    echo -n "$pending" >> "$csv_file"
  fi

  echo "Processed $added new ${benchmark_type} benchmark(s)"
}

# Rebuild everything that is derived from the two history CSVs:
#   1. re-sort both history CSVs in place (newest first),
#   2. rewrite the regressions / improvements CSVs,
#   3. build three markdown tables and render the README from its template.
#
# Globals read:    SHORT_HISTORY_CSV, LONG_HISTORY_CSV, CSV_HEADER,
#                  MD_TABLE_HEADER, README_TEMPLATE_PATH, README_FILE_PATH
# Globals written: REGRESSIONS_CSV and IMPROVEMENTS_CSV files (plus transient
#                  "<file>.tmp" siblings), the README file, and the exported
#                  variables LATEST_SHORT_TABLE, LATEST_LONG_TABLE and
#                  PERFORMANCE_CHANGES_TABLE.
# Requires:        envsubst on PATH.
# NOTE: the loop variable csv_file is not declared local, so it is left set
#       in the caller's scope after this function returns.
regenerate_derived_files() {
  local tmp_file time_delta percentage abs_pct is_sig

  # Sort CSVs by date (newest first) - need to re-sort after appending.
  # The header line is kept on top; data rows are reverse-sorted as text with
  # the key starting at the first column (the timestamp).
  for csv_file in "$SHORT_HISTORY_CSV" "$LONG_HISTORY_CSV"; do
    if [[ -f "$csv_file" ]]; then
      tmp_file=$(mktemp)
      head -1 "$csv_file" > "$tmp_file"
      tail -n +2 "$csv_file" | sort -t',' -k1 -r >> "$tmp_file"
      mv "$tmp_file" "$csv_file"
    fi
  done

  # Both derived CSVs are regenerated from scratch on every run.
  echo "$CSV_HEADER" > "$REGRESSIONS_CSV"
  echo "$CSV_HEADER" > "$IMPROVEMENTS_CSV"

  # Collect rows of one history CSV whose percentage delta exceeds 1% in
  # magnitude. Positive deltas are appended to "<REGRESSIONS_CSV>.tmp", all
  # others to "<IMPROVEMENTS_CSV>.tmp", each line prefixed with
  # "<percentage>|" so it can be sorted numerically afterwards.
  # Arguments: $1 - history CSV path (a missing file is a no-op)
  filter_significant_changes() {
    local csv_file="$1"
    [[ -f "$csv_file" ]] || return 0
    tail -n +2 "$csv_file" | while IFS= read -r line; do
      # The delta column is the quoted part of the row; the percentage is the
      # "[-]digits.digits%" token inside it (a decimal point is required).
      time_delta=$(echo "$line" | grep -o '"[^"]*"' | tr -d '"')
      percentage=$(echo "$time_delta" | grep -oE '[-]?[0-9]+\.[0-9]+%' | tr -d '%')

      if [[ -n "$percentage" ]]; then
        # awk is used because the comparison is on fractional numbers.
        abs_pct=$(awk -v p="$percentage" 'BEGIN { print (p<0) ? -p : p }')
        is_sig=$(awk -v p="$abs_pct" 'BEGIN { print (p>1) ? 1 : 0 }')

        if [[ "$is_sig" == "1" ]]; then
          if awk -v p="$percentage" 'BEGIN { exit (p>0) ? 0 : 1 }'; then
            echo "${percentage}|${line}" >> "${REGRESSIONS_CSV}.tmp"
          else
            echo "${percentage}|${line}" >> "${IMPROVEMENTS_CSV}.tmp"
          fi
        fi
      fi
    done
  }

  filter_significant_changes "$SHORT_HISTORY_CSV"
  filter_significant_changes "$LONG_HISTORY_CSV"

  # Sort and finalize regressions/improvements: regressions with the largest
  # percentage first, improvements with the most negative percentage first.
  # The "<percentage>|" sort prefix is stripped again by cut.
  if [[ -f "${REGRESSIONS_CSV}.tmp" ]]; then
    sort -t'|' -k1 -rn "${REGRESSIONS_CSV}.tmp" | cut -d'|' -f2- >> "$REGRESSIONS_CSV"
    rm "${REGRESSIONS_CSV}.tmp"
  fi
  if [[ -f "${IMPROVEMENTS_CSV}.tmp" ]]; then
    sort -t'|' -k1 -n "${IMPROVEMENTS_CSV}.tmp" | cut -d'|' -f2- >> "$IMPROVEMENTS_CSV"
    rm "${IMPROVEMENTS_CSV}.tmp"
  fi

  # Build markdown tables for README: the first five data rows of each
  # history CSV (the files were sorted newest-first above).
  local LATEST_SHORT_TABLE="$MD_TABLE_HEADER"
  if [[ -f "$SHORT_HISTORY_CSV" ]]; then
    while IFS= read -r line; do
      [[ -n "$line" ]] && LATEST_SHORT_TABLE+=$'\n'"$(csv_to_md_row "$line")"
    done < <(tail -n +2 "$SHORT_HISTORY_CSV" | head -5)
  fi

  local LATEST_LONG_TABLE="$MD_TABLE_HEADER"
  if [[ -f "$LONG_HISTORY_CSV" ]]; then
    while IFS= read -r line; do
      [[ -n "$line" ]] && LATEST_LONG_TABLE+=$'\n'"$(csv_to_md_row "$line")"
    done < <(tail -n +2 "$LONG_HISTORY_CSV" | head -5)
  fi

  # Combined performance changes table: significant rows (same >1% test as
  # above) from both short and long histories, merged and sorted newest first.
  local perf_tmp
  perf_tmp=$(mktemp)
  for csv_file in "$SHORT_HISTORY_CSV" "$LONG_HISTORY_CSV"; do
    [[ -f "$csv_file" ]] || continue
    tail -n +2 "$csv_file" | while IFS= read -r line; do
      time_delta=$(echo "$line" | grep -o '"[^"]*"' | tr -d '"')
      percentage=$(echo "$time_delta" | grep -oE '[-]?[0-9]+\.[0-9]+%' | tr -d '%')

      if [[ -n "$percentage" ]]; then
        abs_pct=$(awk -v p="$percentage" 'BEGIN { print (p<0) ? -p : p }')
        is_sig=$(awk -v p="$abs_pct" 'BEGIN { print (p>1) ? 1 : 0 }')

        if [[ "$is_sig" == "1" ]]; then
          echo "$line" >> "$perf_tmp"
        fi
      fi
    done
  done

  local PERFORMANCE_CHANGES_TABLE="$MD_TABLE_HEADER"
  while IFS= read -r line; do
    [[ -n "$line" ]] && PERFORMANCE_CHANGES_TABLE+=$'\n'"$(csv_to_md_row "$line")"
  done < <(sort -t',' -k1 -r "$perf_tmp")
  rm -f "$perf_tmp"

  # envsubst reads variables from the environment, so the three tables have
  # to be exported before the template is rendered.
  export LATEST_SHORT_TABLE LATEST_LONG_TABLE PERFORMANCE_CHANGES_TABLE
  envsubst < "${README_TEMPLATE_PATH}" > "${README_FILE_PATH}"
  echo "Generated: ${README_FILE_PATH}"
}

# Create the Python virtual environment and install requirements.txt into it.
# Does nothing when the venv directory already exists.
# Globals: VENV_DIR, NIMBUS_ETH1_BENCHMARKS_REPO (read)
setup_python_venv() {
  local requirements="${NIMBUS_ETH1_BENCHMARKS_REPO}/requirements.txt"

  # An existing directory is taken as an already provisioned environment.
  if [[ -d "$VENV_DIR" ]]; then
    return 0
  fi

  echo "Creating virtual environment..."
  python3 -m venv "$VENV_DIR"

  # Activate only for the duration of the installs, then restore the shell.
  # shellcheck source=/dev/null
  source "$VENV_DIR/bin/activate"
  pip install --quiet --upgrade pip
  pip install --quiet -r "$requirements"
  deactivate
}

# Run generate_graphs.py with the virtual environment's Python interpreter.
# Globals: VENV_DIR, NIMBUS_ETH1_BENCHMARKS_REPO (read)
generate_graphs() {
  local interpreter="${VENV_DIR}/bin/python"
  local graph_script="${NIMBUS_ETH1_BENCHMARKS_REPO}/generate_graphs.py"

  printf '%s\n' "Generating graph..."
  "$interpreter" "$graph_script"
}

# Entry point: optionally purge stale rows, ingest new benchmark runs, then
# rebuild the derived CSVs, the README and the graphs.
main() {
  echo "Starting README regeneration..."

  # With --sync, rows whose benchmark directory is gone are dropped first.
  if [[ "$SYNC_MODE" == "true" ]]; then
    purge_stale_entries "short" "$SHORT_HISTORY_CSV"
    purge_stale_entries "long" "$LONG_HISTORY_CSV"
  fi

  process_new_benchmarks "short" "$SHORT_HISTORY_CSV"
  process_new_benchmarks "long" "$LONG_HISTORY_CSV"

  regenerate_derived_files
  setup_python_venv
  generate_graphs

  echo "Done!"
}

main "$@"