1010use DOMNode ;
1111use DOMProcessingInstruction ;
1212use DOMText ;
13- use Symfony \Polyfill \Mbstring \Mbstring ;
1413
1514class Html2Text
1615{
@@ -20,12 +19,12 @@ public static function convert(string $html, Config $config = new Config()): str
2019
2120 if ($ isOfficeDocument ) {
2221 // remove office namespace
23- $ html = str_replace ([" <o:p> " , " </o:p> " ], '' , $ html );
22+ $ html = str_replace ([' <o:p> ' , ' </o:p> ' ], '' , $ html );
2423 }
2524
2625 $ html = static ::fixNewlines ($ html );
2726 if (mb_detect_encoding ($ html , 'UTF-8 ' , true )) {
28- $ html = Mbstring:: mb_convert_encoding ($ html , 'HTML-ENTITIES ' , 'UTF-8 ' );
27+ $ html = mb_convert_encoding ($ html , 'HTML-ENTITIES ' , 'UTF-8 ' );
2928 }
3029
3130 $ doc = static ::getDocument ($ html );
@@ -130,7 +129,7 @@ public static function getDocument(string $html): DOMDocument
130129
131130 public static function isOfficeDocument (string $ html ): bool
132131 {
133- return str_contains ($ html , " urn:schemas-microsoft-com:office " );
132+ return str_contains ($ html , ' urn:schemas-microsoft-com:office ' );
134133 }
135134
136135 /**
@@ -144,6 +143,7 @@ public static function isOfficeDocument(string $html): bool
144143 public static function renderText (string $ text ): string
145144 {
146145 $ text = str_replace (static ::nbspCodes (), ' ' , $ text );
146+
147147 return str_replace (static ::zwnjCodes (), '' , $ text );
148148 }
149149
@@ -152,7 +152,7 @@ public static function isWhitespace(string $text): bool
152152 return strlen (trim (static ::renderText ($ text ), "\n\r\t " )) === 0 ;
153153 }
154154
155- public static function nextChildName (DOMNode $ node ): ? string
155+ public static function nextChildName (DOMNode $ node ): string | null
156156 {
157157 // get the next child
158158 $ nextNode = $ node ->nextSibling ;
@@ -197,7 +197,7 @@ public static function iterateOverNode(
197197 return str_replace ("\n" , "\r" , $ text );
198198 } else {
199199 $ text = static ::renderText ($ node ->wholeText );
200- $ text = (string ) preg_replace (" /[ \\t \\n \\f \\r ]+/im " , " " , $ text );
200+ $ text = (string ) preg_replace (' /[ \\t \\n \\f \\r ]+/im ' , ' ' , $ text );
201201
202202 if (! static ::isWhitespace ($ text ) && ($ prevName == 'p ' || $ prevName == 'div ' )) {
203203 return "\n" . $ text ;
@@ -221,77 +221,86 @@ public static function iterateOverNode(
221221
222222 // start whitespace
223223 switch ($ name ) {
224- case " hr " :
224+ case ' hr ' :
225225 $ prefix = '' ;
226226 if ($ prevName != null ) {
227227 $ prefix = "\n" ;
228228 }
229+
229230 return $ prefix . "--------------------------------------------------------------- \n" ;
230231
231- case " style " :
232- case " head " :
233- case " title " :
234- case " meta " :
235- case " script " :
232+ case ' style ' :
233+ case ' head ' :
234+ case ' title ' :
235+ case ' meta ' :
236+ case ' script ' :
236237 // ignore these tags
237- return "" ;
238-
239- case " h1 " :
240- case " h2 " :
241- case " h3 " :
242- case " h4 " :
243- case " h5 " :
244- case " h6 " :
245- case " ol " :
246- case " ul " :
247- case " pre " :
238+ return '' ;
239+
240+ case ' h1 ' :
241+ case ' h2 ' :
242+ case ' h3 ' :
243+ case ' h4 ' :
244+ case ' h5 ' :
245+ case ' h6 ' :
246+ case ' ol ' :
247+ case ' ul ' :
248+ case ' pre ' :
248249 // add two newlines
249250 $ output = "\n\n" ;
251+
250252 break ;
251253
252- case " td " :
253- case " th " :
254+ case ' td ' :
255+ case ' th ' :
254256 // add tab char to separate table fields
255257 $ output = "\t" ;
258+
256259 break ;
257260
258- case " p " :
261+ case ' p ' :
259262 // Microsoft exchange emails often include HTML which, when passed through
260263 // html2text, results in lots of double line returns everywhere.
261264 //
262265 // To fix this, for any p element with a className of `MsoNormal` (the standard
263266 // classname in any Microsoft export or outlook for a paragraph that behaves
264267 // like a line return) we skip the first line returns and set the name to br.
265268 if ($ isOfficeDocument && $ node ->getAttribute ('class ' ) == 'MsoNormal ' ) {
266- $ output = "" ;
269+ $ output = '' ;
267270 $ name = 'br ' ;
271+
268272 break ;
269273 }
270274
271275 // add two lines
272276 $ output = "\n\n" ;
277+
273278 break ;
274279
275- case " tr " :
280+ case ' tr ' :
276281 // add one line
277282 $ output = "\n" ;
283+
278284 break ;
279285
280- case " div " :
281- $ output = "" ;
286+ case ' div ' :
287+ $ output = '' ;
282288 if ($ prevName !== null ) {
283289 // add one line
284290 $ output .= "\n" ;
285291 }
292+
286293 break ;
287294
288- case "li " :
289- $ output = "- " ;
295+ case 'li ' :
296+ $ output = '- ' ;
297+
290298 break ;
291299
292300 default :
293301 // print out contents of unknown tags
294- $ output = "" ;
302+ $ output = '' ;
303+
295304 break ;
296305 }
297306
@@ -349,45 +358,47 @@ public static function iterateOverNode(
349358
350359 // end whitespace
351360 switch ($ name ) {
352- case " h1 " :
353- case " h2 " :
354- case " h3 " :
355- case " h4 " :
356- case " h5 " :
357- case " h6 " :
358- case " pre " :
359- case " p " :
361+ case ' h1 ' :
362+ case ' h2 ' :
363+ case ' h3 ' :
364+ case ' h4 ' :
365+ case ' h5 ' :
366+ case ' h6 ' :
367+ case ' pre ' :
368+ case ' p ' :
360369 // add two lines
361370 $ output .= "\n\n" ;
371+
362372 break ;
363373
364- case " br " :
374+ case ' br ' :
365375 // add one line
366376 $ output .= "\n" ;
377+
367378 break ;
368379
369- case " div " :
380+ case ' div ' :
370381 break ;
371382
372- case " a " :
383+ case ' a ' :
373384 // links are returned in [text](link) format
374- $ href = $ node ->getAttribute (" href " );
385+ $ href = $ node ->getAttribute (' href ' );
375386
376387 $ output = trim ($ output );
377388
378389 // remove double [[ ]] s from linking images
379- if (str_starts_with ($ output , " [ " ) && str_ends_with ($ output , " ] " )) {
390+ if (str_starts_with ($ output , ' [ ' ) && str_ends_with ($ output , ' ] ' )) {
380391 $ output = substr ($ output , 1 , strlen ($ output ) - 2 );
381392
382393 // for linking images, the title of the <a> overrides the title of the <img>
383- if ($ node ->getAttribute (" title " )) {
384- $ output = $ node ->getAttribute (" title " );
394+ if ($ node ->getAttribute (' title ' )) {
395+ $ output = $ node ->getAttribute (' title ' );
385396 }
386397 }
387398
388399 // if there is no link text, but a title attr
389- if (! $ output && $ node ->getAttribute (" title " )) {
390- $ output = $ node ->getAttribute (" title " );
400+ if (! $ output && $ node ->getAttribute (' title ' )) {
401+ $ output = $ node ->getAttribute (' title ' );
391402 }
392403
393404 if ($ href == null ) {
@@ -417,32 +428,36 @@ public static function iterateOverNode(
417428
418429 // does the next node require additional whitespace?
419430 switch ($ nextName ) {
420- case " h1 " :
421- case " h2 " :
422- case " h3 " :
423- case " h4 " :
424- case " h5 " :
425- case " h6 " :
431+ case ' h1 ' :
432+ case ' h2 ' :
433+ case ' h3 ' :
434+ case ' h4 ' :
435+ case ' h5 ' :
436+ case ' h6 ' :
426437 $ output .= "\n" ;
438+
427439 break ;
428440 }
441+
429442 break ;
430443
431- case " img " :
432- if ($ node ->getAttribute (" title " )) {
433- $ output = " [ " . $ node ->getAttribute (" title " ) . " ] " ;
434- } elseif ($ node ->getAttribute (" alt " )) {
435- $ output = " [ " . $ node ->getAttribute (" alt " ) . " ] " ;
444+ case ' img ' :
445+ if ($ node ->getAttribute (' title ' )) {
446+ $ output = ' [ ' . $ node ->getAttribute (' title ' ) . ' ] ' ;
447+ } elseif ($ node ->getAttribute (' alt ' )) {
448+ $ output = ' [ ' . $ node ->getAttribute (' alt ' ) . ' ] ' ;
436449 } else {
437- $ output = "" ;
450+ $ output = '' ;
438451 }
452+
439453 break ;
440454
441- case " li " :
455+ case ' li ' :
442456 $ output .= "\n" ;
457+
443458 break ;
444459
445- case " blockquote " :
460+ case ' blockquote ' :
446461 // process quoted text for whitespace/newlines
447462 $ output = static ::processWhitespaceNewlines ($ output );
448463
@@ -457,6 +472,7 @@ public static function iterateOverNode(
457472
458473 // add another leading newline and trailing newlines
459474 $ output = "\n" . $ output . "\n\n" ;
475+
460476 break ;
461477 default :
462478 // do nothing
0 commit comments