1: <?php
2: /*
3: * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>
4: *
5: * This script is free software; you can redistribute it and/or modify
6: * it under the terms of the GNU General Public License as published by
7: * the Free Software Foundation; either version 2 of the License, or
8: * (at your option) any later version.
9: *
10: * The GNU General Public License can be found at
11: * http://www.gnu.org/copyleft/gpl.html.
12: *
13: * This script is distributed in the hope that it will be useful,
14: * but WITHOUT ANY WARRANTY; without even the implied warranty of
15: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16: * GNU General Public License for more details.
17: */
18: namespace Html2Text;
19: class Html2Text
20: {
21: const ENCODING = 'UTF-8';
22: /**
23: * Contains the HTML content to convert.
24: *
25: * @type string
26: */
27: protected $html;
28: /**
29: * Contains the converted, formatted text.
30: *
31: * @type string
32: */
33: protected $text;
34: /**
35: * List of preg* regular expression patterns to search for,
36: * used in conjunction with $replace.
37: *
38: * @type array
39: * @see $replace
40: */
41: protected $search = array(
42: "/\r/", // Non-legal carriage return
43: "/[\n\t]+/", // Newlines and tabs
44: '/<head[^>]*>.*?<\/head>/i', // <head>
45: '/<script[^>]*>.*?<\/script>/i', // <script>s -- which strip_tags supposedly has problems with
46: '/<style[^>]*>.*?<\/style>/i', // <style>s -- which strip_tags supposedly has problems with
47: '/<p[^>]*>/i', // <P>
48: '/<br[^>]*>/i', // <br>
49: '/<i[^>]*>(.*?)<\/i>/i', // <i>
50: '/<em[^>]*>(.*?)<\/em>/i', // <em>
51: '/(<ul[^>]*>|<\/ul>)/i', // <ul> and </ul>
52: '/(<ol[^>]*>|<\/ol>)/i', // <ol> and </ol>
53: '/(<dl[^>]*>|<\/dl>)/i', // <dl> and </dl>
54: '/<li[^>]*>(.*?)<\/li>/i', // <li> and </li>
55: '/<dd[^>]*>(.*?)<\/dd>/i', // <dd> and </dd>
56: '/<dt[^>]*>(.*?)<\/dt>/i', // <dt> and </dt>
57: '/<li[^>]*>/i', // <li>
58: '/<hr[^>]*>/i', // <hr>
59: '/<div[^>]*>/i', // <div>
60: '/(<table[^>]*>|<\/table>)/i', // <table> and </table>
61: '/(<tr[^>]*>|<\/tr>)/i', // <tr> and </tr>
62: '/<td[^>]*>(.*?)<\/td>/i', // <td> and </td>
63: '/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
64: );
65: /**
66: * List of pattern replacements corresponding to patterns searched.
67: *
68: * @type array
69: * @see $search
70: */
71: protected $replace = array(
72: '', // Non-legal carriage return
73: ' ', // Newlines and tabs
74: '', // <head>
75: '', // <script>s -- which strip_tags supposedly has problems with
76: '', // <style>s -- which strip_tags supposedly has problems with
77: "\n\n", // <P>
78: "\n", // <br>
79: '_\\1_', // <i>
80: '_\\1_', // <em>
81: "\n\n", // <ul> and </ul>
82: "\n\n", // <ol> and </ol>
83: "\n\n", // <dl> and </dl>
84: "\t* \\1\n", // <li> and </li>
85: " \\1\n", // <dd> and </dd>
86: "\t* \\1", // <dt> and </dt>
87: "\n\t* ", // <li>
88: "\n-------------------------\n", // <hr>
89: "<div>\n", // <div>
90: "\n\n", // <table> and </table>
91: "\n", // <tr> and </tr>
92: "\t\t\\1\n", // <td> and </td>
93: "" // <span class="_html2text_ignore">...</span>
94: );
95: /**
96: * List of preg* regular expression patterns to search for,
97: * used in conjunction with $entReplace.
98: *
99: * @type array
100: * @see $entReplace
101: */
102: protected $entSearch = array(
103: '/™/i', // TM symbol in win-1252
104: '/—/i', // m-dash in win-1252
105: '/&(amp|#38);/i', // Ampersand: see converter()
106: '/[ ]{2,}/', // Runs of spaces, post-handling
107: );
108: /**
109: * List of pattern replacements corresponding to patterns searched.
110: *
111: * @type array
112: * @see $entSearch
113: */
114: protected $entReplace = array(
115: '™', // TM symbol
116: '—', // m-dash
117: '|+|amp|+|', // Ampersand: see converter()
118: ' ', // Runs of spaces, post-handling
119: );
120: /**
121: * List of preg* regular expression patterns to search for
122: * and replace using callback function.
123: *
124: * @type array
125: */
126: protected $callbackSearch = array(
127: '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i', // h1 - h6
128: '/<(b)( [^>]*)?>(.*?)<\/b>/i', // <b>
129: '/<(strong)( [^>]*)?>(.*?)<\/strong>/i', // <strong>
130: '/<(th)( [^>]*)?>(.*?)<\/th>/i', // <th> and </th>
131: '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i' // <a href="">
132: );
133: /**
134: * List of preg* regular expression patterns to search for in PRE body,
135: * used in conjunction with $preReplace.
136: *
137: * @type array
138: * @see $preReplace
139: */
140: protected $preSearch = array(
141: "/\n/",
142: "/\t/",
143: '/ /',
144: '/<pre[^>]*>/',
145: '/<\/pre>/'
146: );
147: /**
148: * List of pattern replacements corresponding to patterns searched for PRE body.
149: *
150: * @type array
151: * @see $preSearch
152: */
153: protected $preReplace = array(
154: '<br>',
155: ' ',
156: ' ',
157: '',
158: '',
159: );
160: /**
161: * Temporary workspace used during PRE processing.
162: *
163: * @type string
164: */
165: protected $preContent = '';
166: /**
167: * Contains the base URL that relative links should resolve to.
168: *
169: * @type string
170: */
171: protected $baseurl = '';
172: /**
173: * Indicates whether content in the $html variable has been converted yet.
174: *
175: * @type boolean
176: * @see $html, $text
177: */
178: protected $converted = false;
179: /**
180: * Contains URL addresses from links to be rendered in plain text.
181: *
182: * @type array
183: * @see buildlinkList()
184: */
185: protected $linkList = array();
186: /**
187: * Various configuration options (able to be set in the constructor)
188: *
189: * @type array
190: */
191: protected $options = array(
192: 'do_links' => 'inline', // 'none'
193: // 'inline' (show links inline)
194: // 'nextline' (show links on the next line)
195: // 'table' (if a table of link URLs should be listed after the text.
196: 'width' => 70, // Maximum width of the formatted text, in columns.
197: // Set this value to 0 (or less) to ignore word wrapping
198: // and not constrain text to a fixed-width column.
199: );
/**
 * Initialiser for the old three-argument constructor form
 * (html, fromFile, options), reached from __construct() when its second
 * argument is not an array.
 *
 * @param string $html     Source HTML
 * @param bool   $fromFile Legacy flag; a truthy value makes set_html() throw
 * @param array  $options  Configuration options merged over the defaults
 */
private function legacyConstruct($html = '', $fromFile = false, array $options = array())
{
    $mergedOptions = array_merge($this->options, $options);

    $this->set_html($html, $fromFile);
    $this->options = $mergedOptions;
}
205: /**
206: * @param string $html Source HTML
207: * @param array $options Set configuration options
208: */
209: public function __construct($html = '', $options = array())
210: {
211: // for backwards compatibility
212: if (!is_array($options)) {
213: return call_user_func_array(array($this, 'legacyConstruct'), func_get_args());
214: }
215: $this->html = $html;
216: $this->options = array_merge($this->options, $options);
217: }
218: /**
219: * Set the source HTML
220: *
221: * @param string $html HTML source content
222: */
223: public function setHtml($html)
224: {
225: $this->html = $html;
226: $this->converted = false;
227: }
228: /**
229: * @deprecated
230: */
231: public function set_html($html, $from_file = false)
232: {
233: if ($from_file) {
234: throw new \InvalidArgumentException("Argument from_file no longer supported");
235: }
236: return $this->setHtml($html);
237: }
238: /**
239: * Returns the text, converted from HTML.
240: *
241: * @return string
242: */
243: public function getText()
244: {
245: if (!$this->converted) {
246: $this->convert();
247: }
248: return $this->text;
249: }
250: /**
251: * @deprecated
252: */
253: public function get_text()
254: {
255: return $this->getText();
256: }
257: /**
258: * @deprecated
259: */
260: public function print_text()
261: {
262: print $this->getText();
263: }
264: /**
265: * @deprecated
266: */
267: public function p()
268: {
269: return $this->print_text();
270: }
271: /**
272: * Sets a base URL to handle relative links.
273: *
274: * @param string $baseurl
275: */
276: public function setBaseUrl($baseurl)
277: {
278: $this->baseurl = $baseurl;
279: }
280: /**
281: * @deprecated
282: */
283: public function set_base_url($baseurl)
284: {
285: return $this->setBaseUrl($baseurl);
286: }
/**
 * Converts $html to plain text and stores the result in $text.
 *
 * Resets the collected link list, runs converter() on the trimmed HTML and,
 * if any links were collected (link mode 'table'), appends them as a
 * numbered "Links:" section. Marks the instance as converted afterwards.
 */
protected function convert()
{
    $this->linkList = array();

    // The input is used as given: backslashes are ordinary content in HTML
    // (e.g. "C:\temp"), so they must not be stripped before conversion.
    $text = trim($this->html);

    $this->converter($text);

    if ($this->linkList) {
        $text .= "\n\nLinks:\n------\n";
        foreach ($this->linkList as $i => $url) {
            $text .= '[' . ($i + 1) . '] ' . $url . "\n";
        }
    }

    $this->text = $text;
    $this->converted = true;
}
/**
 * Converts an HTML fragment to plain text in place.
 *
 * Also called by convertBlockquotes() for the body of each blockquote.
 *
 * @param string $text HTML on entry, plain text on return
 */
protected function converter(&$text)
{
    // Structural elements first: both helpers rewrite $text in place.
    $this->convertBlockquotes($text);
    $this->convertPre($text);

    // Fixed tag replacements, then the callback-driven ones, then drop
    // whatever markup is left.
    $plain = preg_replace($this->search, $this->replace, $text);
    $plain = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $plain);
    $plain = strip_tags($plain);

    $plain = preg_replace($this->entSearch, $this->entReplace, $plain);
    $plain = html_entity_decode($plain, ENT_QUOTES, self::ENCODING);

    // Unknown/unhandled entities cannot be handled by the search-and-replace
    // lists, so they are removed here.
    $plain = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $plain);

    // The ampersand placeholder is resolved only after the unknown-entity
    // cleanup, so that an input such as "&amp;quot;" ends up as "&quot;".
    $plain = str_replace('|+|amp|+|', '&', $plain);

    // Normalise empty lines; the two patterns are applied one after the other.
    $plain = preg_replace(array("/\n\s+\n/", "/[\n]{3,}/"), "\n\n", $plain);

    // Leading empty lines can be produced by e.g. a P tag at the beginning.
    $plain = ltrim($plain, "\n");

    $width = $this->options['width'];
    $text = ($width > 0) ? wordwrap($plain, $width) : $plain;
}
324: /**
325: * Helper function called by preg_replace() on link replacement.
326: *
327: * Maintains an internal list of links to be displayed at the end of the
328: * text, with numeric indices to the original point in the text they
329: * appeared. Also makes an effort at identifying and handling absolute
330: * and relative links.
331: *
332: * @param string $link URL of the link
333: * @param string $display Part of the text to associate number with
334: * @param null $linkOverride
335: * @return string
336: */
337: protected function buildlinkList($link, $display, $linkOverride = null)
338: {
339: $linkMethod = ($linkOverride) ? $linkOverride : $this->options['do_links'];
340: if ($linkMethod == 'none') {
341: return $display;
342: }
343: // Ignored link types
344: if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
345: return $display;
346: }
347: if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
348: $url = $link;
349: } else {
350: $url = $this->baseurl;
351: if (substr($link, 0, 1) != '/') {
352: $url .= '/';
353: }
354: $url .= $link;
355: }
356: if ($linkMethod == 'table') {
357: if (($index = array_search($url, $this->linkList)) === false) {
358: $index = count($this->linkList);
359: $this->linkList[] = $url;
360: }
361: return $display . ' [' . ($index + 1) . ']';
362: } elseif ($linkMethod == 'nextline') {
363: return $display . "\n[" . $url . ']';
364: } else { // link_method defaults to inline
365: return $display . ' [' . $url . ']';
366: }
367: }
/**
 * Rewrites every PRE element of the given HTML in place.
 *
 * Each element is replaced by a DIV whose content has been run through the
 * callback patterns and the $preSearch/$preReplace lists. Elements are
 * handled one at a time until none is left.
 *
 * @param string $text HTML content
 */
protected function convertPre(&$text)
{
    for (;;) {
        // Pick up the first remaining PRE element, if any
        if (!preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $found)) {
            break;
        }

        $this->preContent = $found[1];

        // Apply the tag handlers that work through pregCallback()
        $this->preContent = preg_replace_callback(
            $this->callbackSearch,
            array($this, 'pregCallback'),
            $this->preContent
        );

        // Apply the PRE-specific replacements and wrap the result
        $converted = preg_replace($this->preSearch, $this->preReplace, $this->preContent);
        $this->preContent = sprintf('<div><br>%s<br></div>', $converted);

        // Substitute through a callback: the content may contain sequences
        // such as $0 that a plain replacement string would interpret.
        $text = preg_replace_callback(
            '/<pre[^>]*>.*<\/pre>/ismU',
            array($this, 'pregPreCallback'),
            $text,
            1
        );

        // Release the scratch buffer
        $this->preContent = '';
    }
}
395: /**
396: * Helper function for BLOCKQUOTE body conversion.
397: *
398: * @param string $text HTML content
399: */
400: protected function convertBlockquotes(&$text)
401: {
402: if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
403: $start = 0;
404: $taglen = 0;
405: $level = 0;
406: $diff = 0;
407: foreach ($matches[0] as $m) {
408: if ($m[0][0] == '<' && $m[0][1] == '/') {
409: $level--;
410: if ($level < 0) {
411: $level = 0; // malformed HTML: go to next blockquote
412: } elseif ($level > 0) {
413: // skip inner blockquote
414: } else {
415: $end = $m[1];
416: $len = $end - $taglen - $start;
417: // Get blockquote content
418: $body = substr($text, $start + $taglen - $diff, $len);
419: // Set text width
420: $pWidth = $this->options['width'];
421: if ($this->options['width'] > 0) $this->options['width'] -= 2;
422: // Convert blockquote content
423: $body = trim($body);
424: $this->converter($body);
425: // Add citation markers and create PRE block
426: $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
427: $body = '<pre>' . htmlspecialchars($body) . '</pre>';
428: // Re-set text width
429: $this->options['width'] = $pWidth;
430: // Replace content
431: $text = substr($text, 0, $start - $diff)
432: . $body . substr($text, $end + strlen($m[0]) - $diff);
433: $diff = $len + $taglen + strlen($m[0]) - strlen($body);
434: unset($body);
435: }
436: } else {
437: if ($level == 0) {
438: $start = $m[1];
439: $taglen = strlen($m[0]);
440: }
441: $level++;
442: }
443: }
444: }
445: }
446: /**
447: * Callback function for preg_replace_callback use.
448: *
449: * @param array $matches PREG matches
450: * @return string
451: */
452: protected function pregCallback($matches)
453: {
454: switch (strtolower($matches[1])) {
455: case 'b':
456: case 'strong':
457: return $this->toupper($matches[3]);
458: case 'th':
459: return $this->toupper("\t\t" . $matches[3] . "\n");
460: case 'h':
461: return $this->toupper("\n\n" . $matches[3] . "\n\n");
462: case 'a':
463: // override the link method
464: $linkOverride = null;
465: if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) {
466: $linkOverride = $linkOverrideMatch[1];
467: }
468: // Remove spaces in URL (#1487805)
469: $url = str_replace(' ', '', $matches[3]);
470: return $this->buildlinkList($url, $matches[5], $linkOverride);
471: }
472: return '';
473: }
474: /**
475: * Callback function for preg_replace_callback use in PRE content handler.
476: *
477: * @param array $matches PREG matches
478: * @return string
479: */
480: protected function pregPreCallback(/** @noinspection PhpUnusedParameterInspection */ $matches)
481: {
482: return $this->preContent;
483: }
484: /**
485: * Strtoupper function with HTML tags and entities handling.
486: *
487: * @param string $str Text to convert
488: * @return string Converted text
489: */
490: private function toupper($str)
491: {
492: // string can contain HTML tags
493: $chunks = preg_split('/(<[^>]*>)/', $str, null, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
494: // convert toupper only the text between HTML tags
495: foreach ($chunks as $i => $chunk) {
496: if ($chunk[0] != '<') {
497: $chunks[$i] = $this->strtoupper($chunk);
498: }
499: }
500: return implode($chunks);
501: }
502: /**
503: * Strtoupper multibyte wrapper function with HTML entities handling.
504: *
505: * @param string $str Text to convert
506: * @return string Converted text
507: */
508: private function strtoupper($str)
509: {
510: $str = html_entity_decode($str, ENT_COMPAT, self::ENCODING);
511: if (function_exists('mb_strtoupper')) {
512: $str = mb_strtoupper($str, self::ENCODING);
513: } else {
514: $str = strtoupper($str);
515: }
516: $str = htmlspecialchars($str, ENT_COMPAT, self::ENCODING);
517: return $str;
518: }
519: }
520: