File Html2Text/Html2Text.php

  1: <?php
  2: /*
  3:  * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>
  4:  *
  5:  * This script is free software; you can redistribute it and/or modify
  6:  * it under the terms of the GNU General Public License as published by
  7:  * the Free Software Foundation; either version 2 of the License, or
  8:  * (at your option) any later version.
  9:  *
 10:  * The GNU General Public License can be found at
 11:  * http://www.gnu.org/copyleft/gpl.html.
 12:  *
 13:  * This script is distributed in the hope that it will be useful,
 14:  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 15:  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 16:  * GNU General Public License for more details.
 17:  */
 18: namespace Html2Text;
 19: class Html2Text
 20: {
 21:     const ENCODING = 'UTF-8';
 22:     /**
 23:      * Contains the HTML content to convert.
 24:      *
 25:      * @type string
 26:      */
 27:     protected $html;
 28:     /**
 29:      * Contains the converted, formatted text.
 30:      *
 31:      * @type string
 32:      */
 33:     protected $text;
 34:     /**
 35:      * List of preg* regular expression patterns to search for,
 36:      * used in conjunction with $replace.
 37:      *
 38:      * @type array
 39:      * @see $replace
 40:      */
 41:     protected $search = array(
 42:         "/\r/",                                           // Non-legal carriage return
 43:         "/[\n\t]+/",                                      // Newlines and tabs
 44:         '/<head[^>]*>.*?<\/head>/i',                      // <head>
 45:         '/<script[^>]*>.*?<\/script>/i',                  // <script>s -- which strip_tags supposedly has problems with
 46:         '/<style[^>]*>.*?<\/style>/i',                    // <style>s -- which strip_tags supposedly has problems with
 47:         '/<p[^>]*>/i',                                    // <P>
 48:         '/<br[^>]*>/i',                                   // <br>
 49:         '/<i[^>]*>(.*?)<\/i>/i',                          // <i>
 50:         '/<em[^>]*>(.*?)<\/em>/i',                        // <em>
 51:         '/(<ul[^>]*>|<\/ul>)/i',                          // <ul> and </ul>
 52:         '/(<ol[^>]*>|<\/ol>)/i',                          // <ol> and </ol>
 53:         '/(<dl[^>]*>|<\/dl>)/i',                          // <dl> and </dl>
 54:         '/<li[^>]*>(.*?)<\/li>/i',                        // <li> and </li>
 55:         '/<dd[^>]*>(.*?)<\/dd>/i',                        // <dd> and </dd>
 56:         '/<dt[^>]*>(.*?)<\/dt>/i',                        // <dt> and </dt>
 57:         '/<li[^>]*>/i',                                   // <li>
 58:         '/<hr[^>]*>/i',                                   // <hr>
 59:         '/<div[^>]*>/i',                                  // <div>
 60:         '/(<table[^>]*>|<\/table>)/i',                    // <table> and </table>
 61:         '/(<tr[^>]*>|<\/tr>)/i',                          // <tr> and </tr>
 62:         '/<td[^>]*>(.*?)<\/td>/i',                        // <td> and </td>
 63:         '/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
 64:     );
 65:     /**
 66:      * List of pattern replacements corresponding to patterns searched.
 67:      *
 68:      * @type array
 69:      * @see $search
 70:      */
 71:     protected $replace = array(
 72:         '',                              // Non-legal carriage return
 73:         ' ',                             // Newlines and tabs
 74:         '',                              // <head>
 75:         '',                              // <script>s -- which strip_tags supposedly has problems with
 76:         '',                              // <style>s -- which strip_tags supposedly has problems with
 77:         "\n\n",                          // <P>
 78:         "\n",                            // <br>
 79:         '_\\1_',                         // <i>
 80:         '_\\1_',                         // <em>
 81:         "\n\n",                          // <ul> and </ul>
 82:         "\n\n",                          // <ol> and </ol>
 83:         "\n\n",                          // <dl> and </dl>
 84:         "\t* \\1\n",                     // <li> and </li>
 85:         " \\1\n",                        // <dd> and </dd>
 86:         "\t* \\1",                       // <dt> and </dt>
 87:         "\n\t* ",                        // <li>
 88:         "\n-------------------------\n", // <hr>
 89:         "<div>\n",                       // <div>
 90:         "\n\n",                          // <table> and </table>
 91:         "\n",                            // <tr> and </tr>
 92:         "\t\t\\1\n",                     // <td> and </td>
 93:         ""                               // <span class="_html2text_ignore">...</span>
 94:     );
 95:     /**
 96:      * List of preg* regular expression patterns to search for,
 97:      * used in conjunction with $entReplace.
 98:      *
 99:      * @type array
100:      * @see $entReplace
101:      */
102:     protected $entSearch = array(
103:         '/&#153;/i',                                     // TM symbol in win-1252
104:         '/&#151;/i',                                     // m-dash in win-1252
105:         '/&(amp|#38);/i',                                // Ampersand: see converter()
106:         '/[ ]{2,}/',                                     // Runs of spaces, post-handling
107:     );
108:     /**
109:      * List of pattern replacements corresponding to patterns searched.
110:      *
111:      * @type array
112:      * @see $entSearch
113:      */
114:     protected $entReplace = array(
115:         '™',         // TM symbol
116:         '—',         // m-dash
117:         '|+|amp|+|', // Ampersand: see converter()
118:         ' ',         // Runs of spaces, post-handling
119:     );
120:     /**
121:      * List of preg* regular expression patterns to search for
122:      * and replace using callback function.
123:      *
124:      * @type array
125:      */
126:     protected $callbackSearch = array(
127:         '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // h1 - h6
128:         '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>
129:         '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>
130:         '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th> and </th>
131:         '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i'  // <a href="">
132:     );
133:     /**
134:      * List of preg* regular expression patterns to search for in PRE body,
135:      * used in conjunction with $preReplace.
136:      *
137:      * @type array
138:      * @see $preReplace
139:      */
140:     protected $preSearch = array(
141:         "/\n/",
142:         "/\t/",
143:         '/ /',
144:         '/<pre[^>]*>/',
145:         '/<\/pre>/'
146:     );
147:     /**
148:      * List of pattern replacements corresponding to patterns searched for PRE body.
149:      *
150:      * @type array
151:      * @see $preSearch
152:      */
153:     protected $preReplace = array(
154:         '<br>',
155:         '&nbsp;&nbsp;&nbsp;&nbsp;',
156:         '&nbsp;',
157:         '',
158:         '',
159:     );
160:     /**
161:      * Temporary workspace used during PRE processing.
162:      *
163:      * @type string
164:      */
165:     protected $preContent = '';
166:     /**
167:      * Contains the base URL that relative links should resolve to.
168:      *
169:      * @type string
170:      */
171:     protected $baseurl = '';
172:     /**
173:      * Indicates whether content in the $html variable has been converted yet.
174:      *
175:      * @type boolean
176:      * @see $html, $text
177:      */
178:     protected $converted = false;
179:     /**
180:      * Contains URL addresses from links to be rendered in plain text.
181:      *
182:      * @type array
183:      * @see buildlinkList()
184:      */
185:     protected $linkList = array();
186:     /**
187:      * Various configuration options (able to be set in the constructor)
188:      *
189:      * @type array
190:      */
191:     protected $options = array(
192:         'do_links' => 'inline', // 'none'
193:                                 // 'inline' (show links inline)
194:                                 // 'nextline' (show links on the next line)
195:                                 // 'table' (if a table of link URLs should be listed after the text.
196:         'width' => 70,          //  Maximum width of the formatted text, in columns.
197:                                 //  Set this value to 0 (or less) to ignore word wrapping
198:                                 //  and not constrain text to a fixed-width column.
199:     );
200:     private function legacyConstruct($html = '', $fromFile = false, array $options = array())
201:     {
202:         $this->set_html($html, $fromFile);
203:         $this->options = array_merge($this->options, $options);
204:     }
205:     /**
206:      * @param string $html    Source HTML
207:      * @param array  $options Set configuration options
208:      */
209:     public function __construct($html = '', $options = array())
210:     {
211:         // for backwards compatibility
212:         if (!is_array($options)) {
213:             return call_user_func_array(array($this, 'legacyConstruct'), func_get_args());
214:         }
215:         $this->html = $html;
216:         $this->options = array_merge($this->options, $options);
217:     }
218:     /**
219:      * Set the source HTML
220:      *
221:      * @param string $html HTML source content
222:      */
223:     public function setHtml($html)
224:     {
225:         $this->html = $html;
226:         $this->converted = false;
227:     }
228:     /**
229:      * @deprecated
230:      */
231:     public function set_html($html, $from_file = false)
232:     {
233:         if ($from_file) {
234:             throw new \InvalidArgumentException("Argument from_file no longer supported");
235:         }
236:         return $this->setHtml($html);
237:     }
238:     /**
239:      * Returns the text, converted from HTML.
240:      *
241:      * @return string
242:      */
243:     public function getText()
244:     {
245:         if (!$this->converted) {
246:             $this->convert();
247:         }
248:         return $this->text;
249:     }
250:     /**
251:      * @deprecated
252:      */
253:     public function get_text()
254:     {
255:         return $this->getText();
256:     }
257:     /**
258:      * @deprecated
259:      */
260:     public function print_text()
261:     {
262:         print $this->getText();
263:     }
264:     /**
265:      * @deprecated
266:      */
267:     public function p()
268:     {
269:         return $this->print_text();
270:     }
271:     /**
272:      * Sets a base URL to handle relative links.
273:      *
274:      * @param string $baseurl
275:      */
276:     public function setBaseUrl($baseurl)
277:     {
278:         $this->baseurl = $baseurl;
279:     }
280:     /**
281:      * @deprecated
282:      */
283:     public function set_base_url($baseurl)
284:     {
285:         return $this->setBaseUrl($baseurl);
286:     }
287:     protected function convert()
288:     {
289:         $this->linkList = array();
290:         $text = trim(stripslashes($this->html));
291:         $this->converter($text);
292:         if ($this->linkList) {
293:             $text .= "\n\nLinks:\n------\n";
294:             foreach ($this->linkList as $i => $url) {
295:                 $text .= '[' . ($i + 1) . '] ' . $url . "\n";
296:             }
297:         }
298:         $this->text = $text;
299:         $this->converted = true;
300:     }
301:     protected function converter(&$text)
302:     {
303:         $this->convertBlockquotes($text);
304:         $this->convertPre($text);
305:         $text = preg_replace($this->search, $this->replace, $text);
306:         $text = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $text);
307:         $text = strip_tags($text);
308:         $text = preg_replace($this->entSearch, $this->entReplace, $text);
309:         $text = html_entity_decode($text, ENT_QUOTES, self::ENCODING);
310:         // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
311:         $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);
312:         // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
313:         // This properly handles situation of "&amp;quot;" in input string
314:         $text = str_replace('|+|amp|+|', '&', $text);
315:         // Normalise empty lines
316:         $text = preg_replace("/\n\s+\n/", "\n\n", $text);
317:         $text = preg_replace("/[\n]{3,}/", "\n\n", $text);
318:         // remove leading empty lines (can be produced by eg. P tag on the beginning)
319:         $text = ltrim($text, "\n");
320:         if ($this->options['width'] > 0) {
321:             $text = wordwrap($text, $this->options['width']);
322:         }
323:     }
324:     /**
325:      * Helper function called by preg_replace() on link replacement.
326:      *
327:      * Maintains an internal list of links to be displayed at the end of the
328:      * text, with numeric indices to the original point in the text they
329:      * appeared. Also makes an effort at identifying and handling absolute
330:      * and relative links.
331:      *
332:      * @param  string $link          URL of the link
333:      * @param  string $display       Part of the text to associate number with
334:      * @param  null   $linkOverride
335:      * @return string
336:      */
337:     protected function buildlinkList($link, $display, $linkOverride = null)
338:     {
339:         $linkMethod = ($linkOverride) ? $linkOverride : $this->options['do_links'];
340:         if ($linkMethod == 'none') {
341:             return $display;
342:         }
343:         // Ignored link types
344:         if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
345:             return $display;
346:         }
347:         if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
348:             $url = $link;
349:         } else {
350:             $url = $this->baseurl;
351:             if (substr($link, 0, 1) != '/') {
352:                 $url .= '/';
353:             }
354:             $url .= $link;
355:         }
356:         if ($linkMethod == 'table') {
357:             if (($index = array_search($url, $this->linkList)) === false) {
358:                 $index = count($this->linkList);
359:                 $this->linkList[] = $url;
360:             }
361:             return $display . ' [' . ($index + 1) . ']';
362:         } elseif ($linkMethod == 'nextline') {
363:             return $display . "\n[" . $url . ']';
364:         } else { // link_method defaults to inline
365:             return $display . ' [' . $url . ']';
366:         }
367:     }
368:     protected function convertPre(&$text)
369:     {
370:         // get the content of PRE element
371:         while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
372:             $this->preContent = $matches[1];
373:             // Run our defined tags search-and-replace with callback
374:             $this->preContent = preg_replace_callback(
375:                 $this->callbackSearch,
376:                 array($this, 'pregCallback'),
377:                 $this->preContent
378:             );
379:             // convert the content
380:             $this->preContent = sprintf(
381:                 '<div><br>%s<br></div>',
382:                 preg_replace($this->preSearch, $this->preReplace, $this->preContent)
383:             );
384:             // replace the content (use callback because content can contain $0 variable)
385:             $text = preg_replace_callback(
386:                 '/<pre[^>]*>.*<\/pre>/ismU',
387:                 array($this, 'pregPreCallback'),
388:                 $text,
389:                 1
390:             );
391:             // free memory
392:             $this->preContent = '';
393:         }
394:     }
395:     /**
396:      * Helper function for BLOCKQUOTE body conversion.
397:      *
398:      * @param string $text HTML content
399:      */
400:     protected function convertBlockquotes(&$text)
401:     {
402:         if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
403:             $start = 0;
404:             $taglen = 0;
405:             $level = 0;
406:             $diff = 0;
407:             foreach ($matches[0] as $m) {
408:                 if ($m[0][0] == '<' && $m[0][1] == '/') {
409:                     $level--;
410:                     if ($level < 0) {
411:                         $level = 0; // malformed HTML: go to next blockquote
412:                     } elseif ($level > 0) {
413:                         // skip inner blockquote
414:                     } else {
415:                         $end = $m[1];
416:                         $len = $end - $taglen - $start;
417:                         // Get blockquote content
418:                         $body = substr($text, $start + $taglen - $diff, $len);
419:                         // Set text width
420:                         $pWidth = $this->options['width'];
421:                         if ($this->options['width'] > 0) $this->options['width'] -= 2;
422:                         // Convert blockquote content
423:                         $body = trim($body);
424:                         $this->converter($body);
425:                         // Add citation markers and create PRE block
426:                         $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
427:                         $body = '<pre>' . htmlspecialchars($body) . '</pre>';
428:                         // Re-set text width
429:                         $this->options['width'] = $pWidth;
430:                         // Replace content
431:                         $text = substr($text, 0, $start - $diff)
432:                             . $body . substr($text, $end + strlen($m[0]) - $diff);
433:                         $diff = $len + $taglen + strlen($m[0]) - strlen($body);
434:                         unset($body);
435:                     }
436:                 } else {
437:                     if ($level == 0) {
438:                         $start = $m[1];
439:                         $taglen = strlen($m[0]);
440:                     }
441:                     $level++;
442:                 }
443:             }
444:         }
445:     }
446:     /**
447:      * Callback function for preg_replace_callback use.
448:      *
449:      * @param  array  $matches PREG matches
450:      * @return string
451:      */
452:     protected function pregCallback($matches)
453:     {
454:         switch (strtolower($matches[1])) {
455:             case 'b':
456:             case 'strong':
457:                 return $this->toupper($matches[3]);
458:             case 'th':
459:                 return $this->toupper("\t\t" . $matches[3] . "\n");
460:             case 'h':
461:                 return $this->toupper("\n\n" . $matches[3] . "\n\n");
462:             case 'a':
463:                 // override the link method
464:                 $linkOverride = null;
465:                 if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) {
466:                     $linkOverride = $linkOverrideMatch[1];
467:                 }
468:                 // Remove spaces in URL (#1487805)
469:                 $url = str_replace(' ', '', $matches[3]);
470:                 return $this->buildlinkList($url, $matches[5], $linkOverride);
471:         }
472:         return '';
473:     }
474:     /**
475:      * Callback function for preg_replace_callback use in PRE content handler.
476:      *
477:      * @param  array  $matches PREG matches
478:      * @return string
479:      */
480:     protected function pregPreCallback(/** @noinspection PhpUnusedParameterInspection */ $matches)
481:     {
482:         return $this->preContent;
483:     }
484:     /**
485:      * Strtoupper function with HTML tags and entities handling.
486:      *
487:      * @param  string $str Text to convert
488:      * @return string Converted text
489:      */
490:     private function toupper($str)
491:     {
492:         // string can contain HTML tags
493:         $chunks = preg_split('/(<[^>]*>)/', $str, null, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
494:         // convert toupper only the text between HTML tags
495:         foreach ($chunks as $i => $chunk) {
496:             if ($chunk[0] != '<') {
497:                 $chunks[$i] = $this->strtoupper($chunk);
498:             }
499:         }
500:         return implode($chunks);
501:     }
502:     /**
503:      * Strtoupper multibyte wrapper function with HTML entities handling.
504:      *
505:      * @param  string $str Text to convert
506:      * @return string Converted text
507:      */
508:     private function strtoupper($str)
509:     {
510:         $str = html_entity_decode($str, ENT_COMPAT, self::ENCODING);
511:         if (function_exists('mb_strtoupper')) {
512:             $str = mb_strtoupper($str, self::ENCODING);
513:         } else {
514:             $str = strtoupper($str);
515:         }
516:         $str = htmlspecialchars($str, ENT_COMPAT, self::ENCODING);
517:         return $str;
518:     }
519: }
520:
Namespaces

Classes

Exceptions