Skip to content

Commit 0e389fa

Browse files
committed
Antispambot: Obscure multi-byte addresses when provided.
1 parent 8610706 commit 0e389fa

4 files changed

Lines changed: 102 additions & 10 deletions

File tree

src/wp-includes/compat-utf8.php

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,66 @@ function _wp_has_noncharacters_fallback( string $text ): bool {
425425
return $has_noncharacters;
426426
}
427427

428+
/**
 * Gets the Unicode code point of the first character in a UTF-8 string.
 *
 * This is a polyfill for {@see \mb_ord()}.
 *
 * @since {WP_VERSION}
 *
 * @ignore
 *
 * @param string  $string   String whose first UTF-8 character is decoded into a code point.
 * @param ?string $encoding Must be "UTF-8" if provided, else omitted.
 * @return int|false Code point if able to decode the first character from the string, else false.
 */
function _wp_mb_ord( $string, $encoding = null ) {
	if ( isset( $encoding ) && ! is_utf8_charset( $encoding ) ) {
		return false;
	}

	/*
	 * Scan at most one code point from the start of the string. The scan
	 * starts at offset zero and advances `$at` past what it consumed, so
	 * afterwards `$at` is the byte length of the first character.
	 */
	$at             = 0;
	$invalid_length = 0;
	$count          = _wp_scan_utf8( $string, $at, $invalid_length, null, 1 );

	// Beyond this check, all relevant bytes are well-formed.
	if ( 1 !== $count ) {
		return false;
	}

	/*
	 * Decode from the start of the string (offset 0), not from `$at`,
	 * which already points past the character. Each shift is explicitly
	 * parenthesized because `+` binds tighter than `<<` in PHP.
	 */
	switch ( $at ) {
		case 1:
			// 0xxxxxxx
			return ord( $string[0] );

		case 2:
			// 110xxxxx 10xxxxxx
			return (
				( ( ord( $string[0] ) & 0x1F ) << 6 ) |
				( ord( $string[1] ) & 0x3F )
			);

		case 3:
			// 1110xxxx 10xxxxxx 10xxxxxx
			return (
				( ( ord( $string[0] ) & 0x0F ) << 12 ) |
				( ( ord( $string[1] ) & 0x3F ) << 6 ) |
				( ord( $string[2] ) & 0x3F )
			);

		case 4:
			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			return (
				( ( ord( $string[0] ) & 0x07 ) << 18 ) |
				( ( ord( $string[1] ) & 0x3F ) << 12 ) |
				( ( ord( $string[2] ) & 0x3F ) << 6 ) |
				( ord( $string[3] ) & 0x3F )
			);
	}

	// A single code point should span one to four bytes; report anything else as a failure.
	return false;
}
487+
428488
/**
429489
* Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility
430490
* with the deprecated function from the PHP standard library.

src/wp-includes/formatting.php

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2921,33 +2921,44 @@ function antispambot( $email_address, $hex_encoding = 0 ) {
29212921

29222922
$email_no_spam_address = '';
29232923

2924+
$has_mb_support = function_exists( 'grapheme_extract' );
29242925
$at = 0;
29252926
$next_at = 0;
29262927
$end = strlen( $email_address );
29272928
$invalid_length = 0;
29282929
while ( $at < $end ) {
2929-
if ( 0 === _wp_scan_utf8( $email_address, $next_at, $invalid_length, null, 1 ) ) {
2930-
break;
2930+
if ( $has_mb_support ) {
2931+
$character = grapheme_extract( $email_address, 1, GRAPHEME_EXTR_MAXCHARS, $at, $next_at );
2932+
if ( false === $character ) {
2933+
break;
2934+
}
2935+
} else {
2936+
if ( 0 === _wp_scan_utf8( $email_address, $next_at, $invalid_length, null, 1 ) ) {
2937+
break;
2938+
}
2939+
2940+
$character = substr( $email_address, $at, $next_at - $at );
29312941
}
29322942

2933-
$character = substr( $email_address, $at, $next_at - $at );
29342943
switch ( rand( 0, 1 + $hex_encoding ) ) {
29352944
case 0:
2936-
$code_point = mb_ord( $character );
2945+
$code_point = mb_ord( $character );
29372946
$email_no_spam_address .= "&#{$code_point};";
29382947
break;
29392948

29402949
case 1:
2941-
$email_no_spam_address .= mb_ord( $character );
2950+
$email_no_spam_address .= $character;
29422951
break;
29432952

29442953
case 2:
29452954
for ( $i = 0, $byte_count = strlen( $character ); $i < $byte_count; $i++ ) {
2946-
$hex_value = bin2hex( $character );
2955+
$hex_value = bin2hex( $character );
29472956
$email_no_spam_address .= "%{$hex_value}";
29482957
}
29492958
break;
29502959
}
2960+
2961+
$at = $next_at;
29512962
}
29522963

29532964
return str_replace( '@', '&#64;', $email_no_spam_address );

src/wp-includes/utf8.php

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,20 @@ function wp_is_valid_utf8( string $string ): bool {
5353
}
5454
endif;
5555

56+
if ( ! function_exists( 'mb_ord' ) ) :
	/**
	 * Fallback function for getting the Unicode code point of a character.
	 *
	 * Only defined when no `mb_ord()` exists yet, whether from the mbstring
	 * extension or from another polyfill, so that it is never redeclared.
	 *
	 * @ignore
	 * @private
	 *
	 * @since {WP_VERSION}
	 *
	 * @param string  $string   String whose first character is decoded into a code point.
	 * @param ?string $encoding Optional. Character encoding, passed through to the fallback implementation.
	 * @return int|false Whatever {@see _wp_mb_ord()} returns for the given arguments.
	 */
	function mb_ord( $string, $encoding = null ) {
		return _wp_mb_ord( $string, $encoding );
	}
endif;
69+
5670
if (
5771
extension_loaded( 'mbstring' ) &&
5872
// Maximal subpart substitution introduced by php/php-src@04e59c916f12b322ac55f22314e31bd0176d01cb.

tests/phpunit/tests/formatting/antispambot.php

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ public function data_returns_valid_utf8() {
3535
'deep subdomain' => array( 'kevin@many.subdomains.make.a.happy.man.edu' ),
3636
'short address' => array( 'a@b.co' ),
3737
'weird but legal dots' => array( '..@example.com' ),
38+
'umlauts' => array( 'bücher@gmx.de' ),
39+
'three-byte UTF-8' => array( "\u{FFFD}@who.knows.com" ),
3840
);
3941
}
4042

@@ -62,12 +64,17 @@ public function test_antispambot_obfuscates( $provided ) {
6264
/**
6365
* Data provider.
6466
*
65-
* @return array[]
67+
* @return Generator
6668
*/
6769
public function data_antispambot_obfuscates() {
68-
return array(
69-
array( 'example@example.com' ),
70-
array( '#@example.com' ),
70+
$addresses = array(
71+
'example@example.com',
72+
'#@example.com',
73+
'πετρος@example.com',
7174
);
75+
76+
foreach ( $addresses as $address ) {
77+
yield $address => array( $address );
78+
}
7279
}
7380
}

0 commit comments

Comments
 (0)