Skip to content

Commit 57cf99f

Browse files
committed
Flush FPM logs in case of timeouts
When Lambda times out with the PHP-FPM layer, the logs written by the PHP script are never flushed to stderr by PHP-FPM. That means they never reach CloudWatch, which makes timeouts really hard to debug. With this change, Bref waits for the FPM response until 1 second before the actual Lambda timeout (via a connection timeout on the FastCGI connection). If Bref reaches that point, it will ask PHP-FPM to gracefully restart the PHP-FPM worker, which: - flushes the logs (logs end up in CloudWatch, which is great) - restarts a clean FPM worker, without doing a full FPM restart (which may take longer) Follow up of #770, #772, #895 May address some of #862 Note: this does not change anything for the Function layer (only affects FPM). Also this does not show a full stack track of the place in the code where the timeout happens (#895 did). Still it's an improvement over the current status.
1 parent 54c43ba commit 57cf99f

File tree

6 files changed

+73
-8
lines changed

6 files changed

+73
-8
lines changed

demo/http.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
require __DIR__ . '/../vendor/autoload.php';
44

55
if (isset($_GET['sleep'])) {
6+
error_log('This is a log');
67
sleep(10);
78
}
89

serverless.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ functions:
2525
handler: demo/function.php
2626
description: 'Bref CLI demo'
2727
layers:
28-
- ${bref:layer.php-74}
28+
- ${bref:layer.php-80}
2929
environment:
3030
BREF_LOOP_MAX: 100
3131

@@ -34,7 +34,7 @@ functions:
3434
description: 'Bref HTTP demo'
3535
timeout: 5 # in seconds (API Gateway has a timeout of 29 seconds)
3636
layers:
37-
- ${bref:layer.php-74-fpm}
37+
- ${bref:layer.php-80-fpm}
3838
events:
3939
- http: 'ANY /'
4040

@@ -43,7 +43,7 @@ functions:
4343
description: 'Bref HTTP demo with a PSR-7 handler'
4444
timeout: 5 # in seconds (API Gateway has a timeout of 29 seconds)
4545
layers:
46-
- ${bref:layer.php-74}
46+
- ${bref:layer.php-80}
4747
events:
4848
- http: 'ANY /psr7'
4949
- httpApi: 'GET /psr7'
@@ -55,7 +55,7 @@ functions:
5555
description: 'Bref HTTP demo'
5656
timeout: 5 # in seconds (API Gateway has a timeout of 29 seconds)
5757
layers:
58-
- ${bref:layer.php-74-fpm}
58+
- ${bref:layer.php-80-fpm}
5959
events:
6060
- httpApi: '*'
6161

@@ -64,5 +64,5 @@ functions:
6464
description: 'Bref console command demo'
6565
timeout: 900
6666
layers:
67-
- ${bref:layer.php-74}
67+
- ${bref:layer.php-80}
6868
- ${bref:layer.console}

src/Event/Http/FastCgi/Timeout.php

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<?php declare(strict_types=1);
2+
3+
namespace Bref\Event\Http\FastCgi;
4+
5+
/**
6+
* There was a timeout while processing the PHP request
7+
*
8+
* @internal
9+
*/
10+
final class Timeout extends \Exception
11+
{
12+
public function __construct(int $taskTimeoutInMs)
13+
{
14+
$message = "The request timed out after $taskTimeoutInMs ms. "
15+
. "Note: that duration may be lower than the Lambda timeout, don't be surprised, that is intentional. "
16+
. "Indeed, Bref stops the PHP-FPM request *before* a hard Lambda timeout, because a hard timeout prevents all logs to be written to CloudWatch.";
17+
18+
parent::__construct($message);
19+
}
20+
}

src/Event/Http/FpmHandler.php

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@
55
use Bref\Context\Context;
66
use Bref\Event\Http\FastCgi\FastCgiCommunicationFailed;
77
use Bref\Event\Http\FastCgi\FastCgiRequest;
8+
use Bref\Event\Http\FastCgi\Timeout;
89
use Exception;
910
use hollodotme\FastCGI\Client;
11+
use hollodotme\FastCGI\Exceptions\TimedoutException;
1012
use hollodotme\FastCGI\Interfaces\ProvidesRequestData;
1113
use hollodotme\FastCGI\Interfaces\ProvidesResponseData;
1214
use hollodotme\FastCGI\SocketConnections\UnixDomainSocket;
@@ -110,8 +112,28 @@ public function handleRequest(HttpRequestEvent $event, Context $context): HttpRe
110112
{
111113
$request = $this->eventToFastCgiRequest($event, $context);
112114

115+
// The script will timeout 1 second before the remaining time
116+
// to allow some time for Bref/PHP-FPM to recover and cleanup
117+
$margin = 1000;
118+
$timeoutDelayInMs = max(1000, $context->getRemainingTimeInMillis() - $margin);
119+
113120
try {
114-
$response = $this->client->sendRequest($this->connection, $request);
121+
$socketId = $this->client->sendAsyncRequest($this->connection, $request);
122+
123+
$response = $this->client->readResponse($socketId, $timeoutDelayInMs);
124+
} catch (TimedoutException $e) {
125+
// Send a SIGUSR2 signal to PHP-FPM
126+
// That causes FPM to reload all workers, which allows us to cleanly stop the FPM worker that is stuck in a timeout/waiting state.
127+
// A (intentional) side-effect is that it causes all worker logs buffered by FPM to be written to stderr.
128+
// Without that, all logs written by the PHP script are never written to stderr (and thus never logged to CloudWatch).
129+
// This takes a bit of time (a few ms), but it's still faster than rebooting FPM entirely.
130+
posix_kill($this->fpm->getPid(), SIGUSR2);
131+
132+
// Throw an exception so that:
133+
// - this is reported as a Lambda execution error ("error rate" metrics are accurate)
134+
// - the CloudWatch logs correctly reflect that an execution error occurred
135+
// - the 500 response is the same as if an exception happened in Bref
136+
throw new Timeout($timeoutDelayInMs);
115137
} catch (Throwable $e) {
116138
printf(
117139
"Error communicating with PHP-FPM to read the HTTP response. A root cause of this can be that the Lambda (or PHP) timed out, for example when trying to connect to a remote API or database, if this happens continuously check for those! Bref will restart PHP-FPM now. Original exception message: %s %s\n",

tests/Handler/FpmHandlerTest.php

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
namespace Bref\Test\Handler;
44

55
use Bref\Context\Context;
6-
use Bref\Event\Http\FastCgi\FastCgiCommunicationFailed;
6+
use Bref\Event\Http\FastCgi\Timeout;
77
use Bref\Event\Http\FpmHandler;
88
use Bref\Test\HttpRequestProxyTest;
99
use DMS\PHPUnitExtensions\ArraySubset\ArraySubsetAsserts;
@@ -1071,7 +1071,7 @@ public function test FPM timeouts are recovered from()
10711071
'httpMethod' => 'GET',
10721072
], $this->fakeContext);
10731073
$this->fail('No exception was thrown');
1074-
} catch (FastCgiCommunicationFailed $e) {
1074+
} catch (Timeout $e) {
10751075
// PHP-FPM should work after that
10761076
$statusCode = $this->fpm->handle([
10771077
'version' => '1.0',
@@ -1084,6 +1084,26 @@ public function test FPM timeouts are recovered from()
10841084
}
10851085
}
10861086

1087+
/**
1088+
* See https://github.com/brefphp/bref/issues/862
1089+
*/
1090+
public function test worker logs are still written in case of a timeout()
1091+
{
1092+
$this->fpm = new FpmHandler(__DIR__ . '/PhpFpm/timeout.php', __DIR__ . '/PhpFpm/php-fpm.conf');
1093+
$this->fpm->start();
1094+
1095+
try {
1096+
$this->fpm->handle([
1097+
'version' => '1.0',
1098+
'httpMethod' => 'GET',
1099+
], new Context('abc', time(), 'abc', 'abc'));
1100+
$this->fail('No exception was thrown');
1101+
} catch (Timeout $e) {
1102+
$logs = ob_get_contents();
1103+
self::assertStringContainsString('This is a log message', $logs);
1104+
}
1105+
}
1106+
10871107
/**
10881108
* @see https://github.com/brefphp/bref/issues/316
10891109
*/

tests/Handler/PhpFpm/timeout.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
<?php declare(strict_types=1);
22

3+
error_log('This is a log message');
4+
35
sleep((int) ($_GET['timeout'] ?? 10));

0 commit comments

Comments
 (0)