Skip to content

Commit 9095fe8

Browse files
committed
Fix stale socket recovery without connect probe in client
Move stale socket detection to the daemon's bind() path: on EADDRINUSE, probe with connect() to check whether a real daemon is listening. If connect() fails with ECONNREFUSED (a stale socket left behind by a crash), unlink the socket file and retry bind() once. This avoids the client's connect probe, which creates a real connection that the daemon must accept and handle.
1 parent b1acd06 commit 9095fe8

File tree

2 files changed

+29
-25
lines changed

2 files changed

+29
-25
lines changed

ext/stoolap.c

Lines changed: 7 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1246,31 +1246,15 @@ static int ensure_daemon_running(void)
12461246
stoolap_daemon_init_paths((parent > 1) ? parent : getpid());
12471247
}
12481248

1249-
/* Probe socket with connect() to verify daemon is alive.
1250-
* stat() alone can't distinguish a live socket from a stale one. */
1249+
/* Fast path: socket file exists → assume daemon is running.
1250+
* If it's stale (crashed daemon), proxy_connect() will fail and
1251+
* Database::open() falls through to direct mode. On next call,
1252+
* the daemon's bind() will clean up via EADDRINUSE detection.
1253+
* We don't probe with connect() because that creates a real
1254+
* connection the daemon must accept and handle. */
12511255
struct stat st;
12521256
if (stat(STOOLAP_DAEMON_SOCK, &st) == 0 && S_ISSOCK(st.st_mode)) {
1253-
int probe = socket(AF_UNIX, SOCK_STREAM, 0);
1254-
if (probe >= 0) {
1255-
struct sockaddr_un addr;
1256-
memset(&addr, 0, sizeof(addr));
1257-
addr.sun_family = AF_UNIX;
1258-
strncpy(addr.sun_path, STOOLAP_DAEMON_SOCK, sizeof(addr.sun_path) - 1);
1259-
if (connect(probe, (struct sockaddr *)&addr, sizeof(addr)) == 0) {
1260-
close(probe);
1261-
return 0; /* daemon is alive */
1262-
}
1263-
int err = errno;
1264-
close(probe);
1265-
if (err == ECONNREFUSED) {
1266-
/* No listener — stale socket from crashed daemon */
1267-
unlink(STOOLAP_DAEMON_SOCK);
1268-
} else {
1269-
/* Other error (EAGAIN, EACCES, etc.) — daemon may be alive
1270-
* but busy or inaccessible. Don't unlink. Let bind() decide. */
1271-
return 0;
1272-
}
1273-
}
1257+
return 0;
12741258
}
12751259

12761260
/* Double-fork a daemon candidate. If multiple processes race here,

ext/stoolap_daemon.c

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1787,14 +1787,34 @@ static int create_listen_socket(void)
17871787

17881788
if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
17891789
if (errno == EADDRINUSE) {
1790-
/* Another daemon won the race — exit silently */
1790+
/* Socket exists. Probe to see if a daemon is actually listening.
1791+
* If ECONNREFUSED → stale socket from crash, unlink and retry.
1792+
* If connect succeeds → live daemon, we lost the race. */
1793+
int probe = socket(AF_UNIX, SOCK_STREAM, 0);
1794+
if (probe >= 0) {
1795+
if (connect(probe, (struct sockaddr *)&addr, sizeof(addr)) == 0) {
1796+
close(probe); /* live daemon — we're the loser */
1797+
close(fd);
1798+
return -2;
1799+
}
1800+
int err = errno;
1801+
close(probe);
1802+
if (err == ECONNREFUSED) {
1803+
/* Stale socket — unlink and retry bind once */
1804+
unlink(STOOLAP_DAEMON_SOCK);
1805+
if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) == 0) {
1806+
goto bind_ok;
1807+
}
1808+
}
1809+
}
17911810
close(fd);
1792-
return -2; /* special: not an error, just lost the race */
1811+
return -2; /* give up — another daemon likely won the retry */
17931812
}
17941813
LOG_ERR("bind(%s) failed: %s", STOOLAP_DAEMON_SOCK, strerror(errno));
17951814
close(fd);
17961815
return -1;
17971816
}
1817+
bind_ok:
17981818

17991819
if (listen(fd, 16) < 0) {
18001820
LOG_ERR("listen() failed: %s", strerror(errno));

0 commit comments

Comments
 (0)