html_dom.php 57 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328
  1. <?php
  2. ////////// This is a PHP 7.3 compatible version of dom.php
  3. ////////// https://github.com/Kub-AT/php-simple-html-dom-parser
  // Node kinds; simple_html_dom_node::$nodetype holds one of these values.
  define('HDOM_TYPE_ELEMENT', 1);
  define('HDOM_TYPE_COMMENT', 2);
  define('HDOM_TYPE_TEXT', 3);
  define('HDOM_TYPE_ENDTAG', 4);
  define('HDOM_TYPE_ROOT', 5);
  define('HDOM_TYPE_UNKNOWN', 6);

  // Attribute quoting styles, consulted when a start tag is rebuilt.
  define('HDOM_QUOTE_DOUBLE', 0);
  define('HDOM_QUOTE_SINGLE', 1);
  define('HDOM_QUOTE_NO', 3);

  // Keys of the per-node bookkeeping array simple_html_dom_node::$_.
  define('HDOM_INFO_BEGIN', 0);
  define('HDOM_INFO_END', 1);
  define('HDOM_INFO_QUOTE', 2);
  define('HDOM_INFO_SPACE', 3);
  define('HDOM_INFO_TEXT', 4);
  define('HDOM_INFO_INNER', 5);
  define('HDOM_INFO_OUTER', 6);
  define('HDOM_INFO_ENDSPACE', 7);

  // Defaults that the including script may pre-define to override.
  if (!defined('DEFAULT_TARGET_CHARSET')) {
      define('DEFAULT_TARGET_CHARSET', 'UTF-8');
  }
  if (!defined('DEFAULT_BR_TEXT')) {
      define('DEFAULT_BR_TEXT', "\r\n");
  }
  if (!defined('DEFAULT_SPAN_TEXT')) {
      define('DEFAULT_SPAN_TEXT', ' ');
  }
  // Upper bound (in bytes) on the input accepted by file_get_html() and
  // str_get_html().
  if (!defined('MAX_FILE_SIZE')) {
      define('MAX_FILE_SIZE', 600000);
  }

  define('HDOM_SMARTY_AS_TEXT', 1);
  /**
   * Read a file or URL with file_get_contents() and parse it into a DOM.
   *
   * $url, $use_include_path, $context, $offset and $maxLen are forwarded to
   * file_get_contents(); a non-positive $maxLen is replaced by MAX_FILE_SIZE.
   * The remaining arguments are handed to the simple_html_dom constructor,
   * and $lowercase / $stripRN are additionally passed to load().
   *
   * @return mixed false when nothing was read (or the content is longer than
   *               $maxLen), otherwise the result of simple_html_dom::load().
   */
  function file_get_html(
      $url,
      $use_include_path = false,
      $context = null,
      $offset = 0,
      $maxLen = -1,
      $lowercase = true,
      $forceTagsClosed = true,
      $target_charset = DEFAULT_TARGET_CHARSET,
      $stripRN = true,
      $defaultBRText = DEFAULT_BR_TEXT,
      $defaultSpanText = DEFAULT_SPAN_TEXT)
  {
      // A non-positive limit means "use the library-wide cap".
      if ($maxLen <= 0) {
          $maxLen = MAX_FILE_SIZE;
      }

      $dom = new simple_html_dom(
          null,
          $lowercase,
          $forceTagsClosed,
          $target_charset,
          $stripRN,
          $defaultBRText,
          $defaultSpanText
      );

      $contents = file_get_contents($url, $use_include_path, $context, $offset, $maxLen);

      if (!empty($contents) && strlen($contents) <= $maxLen) {
          return $dom->load($contents, $lowercase, $stripRN);
      }

      // Nothing usable was read: release the parser and report failure.
      $dom->clear();
      return false;
  }
  /**
   * Parse an HTML string into a DOM.
   *
   * All arguments after $str are handed to the simple_html_dom constructor;
   * $lowercase and $stripRN are additionally passed to load().
   *
   * @return mixed false when $str is empty() or longer than MAX_FILE_SIZE
   *               bytes, otherwise the result of simple_html_dom::load().
   */
  function str_get_html(
      $str,
      $lowercase = true,
      $forceTagsClosed = true,
      $target_charset = DEFAULT_TARGET_CHARSET,
      $stripRN = true,
      $defaultBRText = DEFAULT_BR_TEXT,
      $defaultSpanText = DEFAULT_SPAN_TEXT)
  {
      $dom = new simple_html_dom(
          null,
          $lowercase,
          $forceTagsClosed,
          $target_charset,
          $stripRN,
          $defaultBRText,
          $defaultSpanText
      );

      if (!empty($str) && strlen($str) <= MAX_FILE_SIZE) {
          return $dom->load($str, $lowercase, $stripRN);
      }

      // Empty or oversized input: release the parser and report failure.
      $dom->clear();
      return false;
  }
  /**
   * Print the tree below $node by delegating to its dump() method.
   *
   * The previous body called $node->dump($node): the node itself landed in
   * the $show_attr slot and both $show_attr and $deep were ignored. The
   * arguments are now forwarded as given; with the defaults the output is
   * the same as before.
   *
   * @param object $node      Node exposing dump($show_attr, $depth).
   * @param bool   $show_attr Whether attributes are printed.
   * @param int    $deep      Indentation depth of the first printed line.
   */
  function dump_html_tree($node, $show_attr = true, $deep = 0)
  {
      $node->dump($show_attr, $deep);
  }
  95. class simple_html_dom_node
  96. {
  97. public $nodetype = HDOM_TYPE_TEXT;
  98. public $tag = 'text';
  99. public $attr = array();
  100. public $children = array();
  101. public $nodes = array();
  102. public $parent = null;
  103. public $_ = array();
  104. public $tag_start = 0;
  105. private $dom = null;
  /**
   * Bind the node to its owning document.
   *
   * Stores $dom as the back-reference and appends this node to $dom->nodes.
   */
  function __construct($dom)
  {
      $this->dom = $dom;
      $dom->nodes[] = $this;
  }
  /**
   * Drop all object references via clear() when the node is destroyed.
   */
  function __destruct()
  {
      $this->clear();
  }
  /**
   * String conversion yields the same value as outertext().
   */
  function __toString()
  {
      $markup = $this->outertext();
      return $markup;
  }
  /**
   * Reset every reference this node holds to other objects.
   *
   * Note that $nodes and $children become null, not empty arrays.
   */
  function clear()
  {
      $this->parent = null;
      $this->children = null;
      $this->nodes = null;
      $this->dom = null;
  }
  /**
   * Echo this node and, recursively, everything in $this->nodes.
   *
   * Each node is printed on its own line, indented with $depth tab
   * characters. When $show_attr is truthy and attributes exist, they follow
   * the tag name as ([name]=>"value", ...).
   */
  function dump($show_attr = true, $depth = 0)
  {
      $line = str_repeat("\t", $depth) . $this->tag;

      if ($show_attr && count($this->attr) > 0) {
          $line .= '(';
          foreach ($this->attr as $name => $value) {
              $line .= "[$name]=>\"$value\", ";
          }
          $line .= ')';
      }

      echo $line . "\n";

      if (!$this->nodes) {
          return;
      }

      foreach ($this->nodes as $child) {
          $child->dump($show_attr, $depth + 1);
      }
  }
  /**
   * Build a one-line debug description of this node.
   *
   * The line lists the tag, the attributes, the contents of the bookkeeping
   * array $_, the text (when set), the cached inner text, the number of
   * children and nodes, and tag_start.
   *
   * Fixes over the previous version:
   * - the inner-text section read an undefined $node variable and therefore
   *   always printed NULL; it now reads $this;
   * - children / nodes are counted safely after clear() has set them to
   *   null.
   *
   * @param bool $echo When truthy the line is echoed and nothing is
   *                   returned; otherwise the line is returned.
   * @return string|null
   */
  function dump_node($echo = true)
  {
      $string = $this->tag;

      if (count($this->attr) > 0) {
          $string .= '(';
          foreach ($this->attr as $k => $v) {
              $string .= "[$k]=>\"$v\", ";
          }
          $string .= ')';
      }

      if (count($this->_) > 0) {
          $string .= ' $_ (';
          foreach ($this->_ as $k => $v) {
              if (is_array($v)) {
                  // One level of nesting is expanded (e.g. quote/space info).
                  $string .= "[$k]=>(";
                  foreach ($v as $k2 => $v2) {
                      $string .= "[$k2]=>\"$v2\", ";
                  }
                  $string .= ')';
              } else {
                  $string .= "[$k]=>\"$v\", ";
              }
          }
          $string .= ')';
      }

      if (isset($this->text)) {
          $string .= " text: ({$this->text})";
      }

      $string .= ' HDOM_INNER_INFO: ';
      if (isset($this->_[HDOM_INFO_INNER])) {
          $string .= "'" . $this->_[HDOM_INFO_INNER] . "'";
      } else {
          $string .= ' NULL ';
      }

      // clear() nulls both lists; count(null) warns on PHP 7.2+ and throws
      // on PHP 8.
      $string .= ' children: ' . (is_array($this->children) ? count($this->children) : 0);
      $string .= ' nodes: ' . (is_array($this->nodes) ? count($this->nodes) : 0);
      $string .= ' tag_start: ' . $this->tag_start;
      $string .= "\n";

      if ($echo) {
          echo $string;
          return;
      }

      return $string;
  }
  /**
   * Get the parent node, or move this node under a new parent.
   *
   * With no argument the current parent is returned unchanged. With a
   * $parent, this node is first removed from the previous parent's $nodes
   * and $children lists (the earlier version skipped this, leaving the node
   * listed under both parents) and is then appended to both lists of the
   * new parent.
   *
   * NOTE(review): cached bookkeeping in $_ (inner/outer text, begin/end
   * indexes) is not recomputed here - confirm whether callers rely on it.
   *
   * @param object|null $parent New parent node, or null to only read.
   * @return object|null The (possibly new) parent.
   */
  function parent($parent = null)
  {
      if ($parent !== null) {
          $previous = $this->parent;

          if ($previous !== null) {
              // Rebuild each list without this node; identity comparison
              // avoids deep-comparing the cyclic node graph.
              foreach (array('nodes', 'children') as $list) {
                  if (!is_array($previous->$list)) {
                      continue;
                  }
                  $kept = array();
                  foreach ($previous->$list as $entry) {
                      if ($entry !== $this) {
                          $kept[] = $entry;
                      }
                  }
                  $previous->$list = $kept;
              }
          }

          $this->parent = $parent;
          $this->parent->nodes[] = $this;
          $this->parent->children[] = $this;
      }

      return $this->parent;
  }
  /**
   * Tell whether this node has at least one child.
   *
   * @return bool
   */
  function has_child()
  {
      if (empty($this->children)) {
          return false;
      }

      return true;
  }
  /**
   * Return all children, or the single child stored at $idx.
   *
   * @param int $idx -1 (the default) returns the whole $children list.
   * @return mixed The list, one child, or null when $idx is not set.
   */
  function children($idx = -1)
  {
      if ($idx === -1) {
          return $this->children;
      }

      return isset($this->children[$idx]) ? $this->children[$idx] : null;
  }
  /**
   * Return the first child, or null when there are no children.
   *
   * Uses empty() instead of count() so a node whose $children was set to
   * null by clear() yields null rather than a warning (PHP 7.2+) or a
   * TypeError (PHP 8).
   *
   * @return object|null
   */
  function first_child()
  {
      if (empty($this->children)) {
          return null;
      }

      return $this->children[0];
  }
  /**
   * Return the last child, or null when there are no children.
   *
   * Uses empty() instead of count() so a node whose $children was set to
   * null by clear() yields null rather than a warning (PHP 7.2+) or a
   * TypeError (PHP 8). end() runs on a local copy so the internal pointer
   * of the live $children array is left alone.
   *
   * @return object|null
   */
  function last_child()
  {
      if (empty($this->children)) {
          return null;
      }

      $children = $this->children;
      return end($children);
  }
  /**
   * Return the child that follows this node in its parent's $children.
   *
   * @return object|null null when there is no parent, the node is not in
   *                     the parent's list, or it is the last entry.
   */
  function next_sibling()
  {
      if ($this->parent === null) {
          return null;
      }

      $position = array_search($this, $this->parent->children, true);
      if ($position === false || !isset($this->parent->children[$position + 1])) {
          return null;
      }

      return $this->parent->children[$position + 1];
  }
  /**
   * Return the child that precedes this node in its parent's $children.
   *
   * @return object|null null when there is no parent, the node is not in
   *                     the parent's list, or it is the first entry.
   */
  function prev_sibling()
  {
      if ($this->parent === null) {
          return null;
      }

      $position = array_search($this, $this->parent->children, true);
      if ($position === false || !($position > 0)) {
          return null;
      }

      return $this->parent->children[$position - 1];
  }
  /**
   * Walk up the parent chain and return the nearest ancestor whose tag is
   * identical (===) to $tag.
   *
   * @param string $tag Tag name to look for.
   * @return object|null The matching ancestor, or null when none matches.
   */
  function find_ancestor_tag($tag)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      for ($candidate = $this->parent; $candidate !== null; $candidate = $candidate->parent) {
          if (is_object($debug_object)) {
              $debug_object->debug_log(2, 'Current tag is: ' . $candidate->tag);
          }

          if ($candidate->tag === $tag) {
              return $candidate;
          }
      }

      return null;
  }
  /**
   * Return the markup inside this node.
   *
   * Order of precedence: a cached HDOM_INFO_INNER value, then the node's own
   * HDOM_INFO_TEXT (passed through the document's restore_noise()), and
   * finally the concatenated outertext() of every entry in $nodes.
   *
   * @return string
   */
  function innertext()
  {
      if (isset($this->_[HDOM_INFO_INNER])) {
          return $this->_[HDOM_INFO_INNER];
      }

      if (isset($this->_[HDOM_INFO_TEXT])) {
          return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
      }

      $html = '';
      foreach ($this->nodes as $child) {
          $html .= $child->outertext();
      }

      return $html;
  }
  /**
   * Return the markup of this node including its own tags.
   *
   * - The root node returns its innertext().
   * - When the document has a callback, it is invoked with this node before
   *   anything is read, so it may alter the node first.
   * - A cached HDOM_INFO_OUTER wins, then the node's own HDOM_INFO_TEXT.
   * - Otherwise the result is the start tag from makeup(), followed by the
   *   cached inner text (skipped for <br>) or the converted outertext of
   *   each entry in $nodes, followed by the end tag when HDOM_INFO_END is
   *   set and non-zero.
   *
   * @return string
   */
  function outertext()
  {
      global $debug_object;
      if (is_object($debug_object)) {
          $suffix = '';
          if ($this->tag === 'text' && !empty($this->text)) {
              $suffix = ' with text: ' . $this->text;
          }
          $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $suffix);
      }

      if ($this->tag === 'root') {
          return $this->innertext();
      }

      // Runs before any cached value is consulted.
      if ($this->dom && $this->dom->callback !== null) {
          call_user_func_array($this->dom->callback, array($this));
      }

      if (isset($this->_[HDOM_INFO_OUTER])) {
          return $this->_[HDOM_INFO_OUTER];
      }

      if (isset($this->_[HDOM_INFO_TEXT])) {
          return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
      }

      // Start tag, taken from the node registered at this node's begin index.
      $html = '';
      if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
          $html = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
      }

      // Content.
      if (isset($this->_[HDOM_INFO_INNER])) {
          if ($this->tag !== 'br') {
              $html .= $this->_[HDOM_INFO_INNER];
          }
      } elseif ($this->nodes) {
          foreach ($this->nodes as $child) {
              $html .= $this->convert_text($child->outertext());
          }
      }

      // End tag.
      if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
          $html .= '</' . $this->tag . '>';
      }

      return $html;
  }
  /**
   * Return the plain-text content of this node.
   *
   * - A cached HDOM_INFO_INNER value is returned as is.
   * - Text nodes return their noise-restored text; comment and unknown
   *   nodes, as well as script and style elements, return ''.
   * - Otherwise the converted text() of every entry in $nodes is
   *   concatenated. Before a <p> entry the accumulated text is trimmed and
   *   followed by a blank line; after a <span> entry the document's
   *   default_span_text is appended.
   *
   * @return string
   */
  function text()
  {
      if (isset($this->_[HDOM_INFO_INNER])) {
          return $this->_[HDOM_INFO_INNER];
      }

      switch ($this->nodetype) {
          case HDOM_TYPE_TEXT:
              return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
          case HDOM_TYPE_COMMENT:
          case HDOM_TYPE_UNKNOWN:
              return '';
      }

      if (strcasecmp($this->tag, 'script') === 0 || strcasecmp($this->tag, 'style') === 0) {
          return '';
      }

      // $nodes is null once clear() has run on this node.
      if (is_null($this->nodes)) {
          return '';
      }

      $plain = '';
      foreach ($this->nodes as $child) {
          if ($child->tag === 'p') {
              $plain = trim($plain) . "\n\n";
          }

          $plain .= $this->convert_text($child->text());

          if ($child->tag === 'span') {
              $plain .= $this->dom->default_span_text;
          }
      }

      return $plain;
  }
  /**
   * Return innertext() with CDATA delimiters removed.
   *
   * The opening '<![CDATA[' is removed case-insensitively, the closing
   * ']]>' case-sensitively.
   *
   * @return string
   */
  function xmltext()
  {
      $withoutOpeners = str_ireplace('<![CDATA[', '', $this->innertext());

      return str_replace(']]>', '', $withoutOpeners);
  }
  /**
   * Rebuild the start tag of this node from its attributes and the
   * spacing / quoting details recorded in $_.
   *
   * Nodes carrying HDOM_INFO_TEXT (text, comment, unknown) return that text
   * with noise restored instead. Attributes whose value is null or false
   * are skipped, a value of true produces a bare attribute name, and every
   * other value is written with the recorded quote style.
   *
   * @return string
   */
  function makeup()
  {
      if (isset($this->_[HDOM_INFO_TEXT])) {
          return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
      }

      $tag = '<' . $this->tag;

      // $slot counts every attribute, skipped ones included, because the
      // spacing and quote tables are indexed by attribute position.
      $slot = 0;
      foreach ($this->attr as $name => $value) {
          $i = $slot++;

          if ($value === null || $value === false) {
              continue;
          }

          $tag .= $this->_[HDOM_INFO_SPACE][$i][0];

          if ($value === true) {
              // Attribute without a value (nowrap, checked, selected, ...).
              $tag .= $name;
              continue;
          }

          switch ($this->_[HDOM_INFO_QUOTE][$i]) {
              case HDOM_QUOTE_DOUBLE:
                  $quote = '"';
                  break;
              case HDOM_QUOTE_SINGLE:
                  $quote = '\'';
                  break;
              default:
                  $quote = '';
          }

          $tag .= $name
              . $this->_[HDOM_INFO_SPACE][$i][1]
              . '='
              . $this->_[HDOM_INFO_SPACE][$i][2]
              . $quote
              . $value
              . $quote;
      }

      return $this->dom->restore_noise($tag) . $this->_[HDOM_INFO_ENDSPACE] . '>';
  }
  /**
   * Find the nodes below this one that match a CSS-like selector.
   *
   * The selector string is split by parse_selector() into alternatives,
   * each a chain of levels. Every chain starts at this node and is narrowed
   * level by level with seek(), using the combinator recorded on the
   * previous level. Matches of all alternatives are merged and returned in
   * ascending order of their index in the document's node list.
   *
   * @param string   $selector  Selector string.
   * @param int|null $idx       null returns every match; otherwise the
   *                            match at that position (negative values
   *                            count from the end).
   * @param bool     $lowercase Passed through to seek().
   * @return mixed Array of nodes, a single node, or null when $idx does not
   *               exist. An empty array is returned when nothing can be
   *               searched.
   */
  function find($selector, $idx = null, $lowercase = false)
  {
      $selectors = $this->parse_selector($selector);
      $total = count($selectors);
      if ($total === 0) { return array(); }

      $matched = array();

      for ($s = 0; $s < $total; ++$s) {
          $depth = count($selectors[$s]);
          if ($depth === 0) { return array(); }
          if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }

          // Node indexes (as keys) from which the next level is searched.
          $frontier = array($this->_[HDOM_INFO_BEGIN] => 1);
          $combinator = ' ';

          // Iterative descent, one selector level at a time.
          for ($level = 0; $level < $depth; ++$level) {
              $next = array();
              foreach ($frontier as $key => $unused) {
                  $start = ($key === -1) ? $this->dom->root : $this->dom->nodes[$key];
                  $start->seek($selectors[$s][$level], $next, $combinator, $lowercase);
              }
              $frontier = $next;
              $combinator = $selectors[$s][$level][4];
          }

          // Array union keeps entries already present and adds new keys.
          $matched += $frontier;
      }

      ksort($matched);

      $found = array();
      foreach ($matched as $key => $unused) {
          $found[] = $this->dom->nodes[$key];
      }

      if (is_null($idx)) {
          return $found;
      }

      if ($idx < 0) {
          $idx = count($found) + $idx;
      }

      return isset($found[$idx]) ? $found[$idx] : null;
  }
  /**
   * Collect the nodes reachable from this node through one combinator that
   * satisfy one parsed selector level.
   *
   * Changes over the previous version:
   * - sibling lookups compare nodes by identity; the former loose
   *   in_array() deep-compared node objects that reference each other;
   * - the '~' combinator no longer returns the reference node itself, only
   *   the siblings after it;
   * - walking up to find an end index no longer dereferences a null parent;
   * - class names are compared strictly (no numeric-string juggling);
   * - the unused combinator local was dropped.
   *
   * @param array  $selector   One level: array(tag, id, classes, attributes,
   *                           combinator).
   * @param array  $ret        Result set, filled by reference: the begin
   *                           index of every match is added as a key with
   *                           the value 1.
   * @param string $parent_cmd Combinator linking this node to the
   *                           candidates: ' ', '>', '+' or '~'.
   * @param bool   $lowercase  When truthy, node classes and attribute
   *                           values are lower-cased before comparison.
   * @return void
   */
  protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      list($tag, $id, $class, $attributes) = $selector;
      $nodes = array();

      if ($parent_cmd === ' ') { // Descendant Combinator
          // Find the closing tag of an ancestor if the current element has
          // no closing tag of its own (i.e. void element).
          $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
          if ($end == 0) {
              $parent = $this->parent;
              while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
                  $end -= 1;
                  $parent = $parent->parent;
              }
              // No ancestor with an end index leaves $end as computed, which
              // is the value the old code produced (it added null).
              if ($parent !== null) {
                  $end += $parent->_[HDOM_INFO_END];
              }
          }

          // Candidates are the document nodes between this node and $end.
          $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
          $nodes_count = $end - $nodes_start;
          $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
      } elseif ($parent_cmd === '>') { // Child Combinator
          $nodes = $this->children;
      } elseif (($parent_cmd === '+' || $parent_cmd === '~') && $this->parent) {
          // Identity search: a loose comparison would recurse through the
          // parent/children references of every sibling.
          $index = array_search($this, $this->parent->children, true);
          if ($index !== false) {
              if ($parent_cmd === '+') { // Next-Sibling Combinator
                  if ($index + 1 < count($this->parent->children)) {
                      $nodes[] = $this->parent->children[$index + 1];
                  }
              } else { // Subsequent-Sibling Combinator
                  $nodes = array_slice($this->parent->children, $index + 1);
              }
          }
      }

      foreach ($nodes as $node) {
          $pass = true;

          // Skip root nodes
          if (!$node->parent) {
              $pass = false;
          }

          // Skip if node isn't a child node (i.e. text nodes)
          if ($pass && !in_array($node, $node->parent->children, true)) {
              $pass = false;
          }

          // Skip if tag doesn't match
          if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
              $pass = false;
          }

          // Skip if ID doesn't exist
          if ($pass && $id !== '' && !isset($node->attr['id'])) {
              $pass = false;
          }

          // Check if ID matches
          if ($pass && $id !== '' && isset($node->attr['id'])) {
              // Note: Only consider the first ID (as browsers do)
              $node_id = explode(' ', trim($node->attr['id']))[0];
              if ($id !== $node_id) { $pass = false; }
          }

          // Check if all class(es) exist
          if ($pass && $class !== '' && is_array($class) && !empty($class)) {
              if (isset($node->attr['class'])) {
                  $node_classes = explode(' ', $node->attr['class']);
                  if ($lowercase) {
                      $node_classes = array_map('strtolower', $node_classes);
                  }
                  foreach ($class as $wanted) {
                      if (!in_array($wanted, $node_classes, true)) {
                          $pass = false;
                          break;
                      }
                  }
              } else {
                  $pass = false;
              }
          }

          // Check attributes
          if ($pass
              && $attributes !== ''
              && is_array($attributes)
              && !empty($attributes)) {
              foreach ($attributes as $a) {
                  list(
                      $att_name,
                      $att_expr,
                      $att_val,
                      $att_inv,
                      $att_case_sensitivity
                  ) = $a;

                  /**
                   * Handle indexing attributes (i.e. "[2]").
                   *
                   * Note: This is not supported by the CSS Standard but adds
                   * the ability to select items compatible to XPath (i.e.
                   * the 3rd element within its parent).
                   *
                   * Note: This doesn't conflict with the CSS Standard which
                   * doesn't work on numeric attributes anyway.
                   */
                  if (is_numeric($att_name)
                      && $att_expr === ''
                      && $att_val === '') {
                      $count = 0;
                      // Position of the node among same-tag siblings.
                      foreach ($node->parent->children as $sibling) {
                          if ($sibling->tag === $node->tag) { ++$count; }
                          if ($sibling === $node) { break; }
                      }
                      // If this is the correct node, continue with the next
                      // attribute; otherwise fall through to the ordinary
                      // attribute check below.
                      if ($count === (int)$att_name) { continue; }
                  }

                  // Check attribute availability
                  if ($att_inv) { // Attribute should NOT be set
                      if (isset($node->attr[$att_name])) {
                          $pass = false;
                          break;
                      }
                  } else { // Attribute should be set
                      // todo: "plaintext" is not a valid CSS selector!
                      if ($att_name !== 'plaintext'
                          && !isset($node->attr[$att_name])) {
                          $pass = false;
                          break;
                      }
                  }

                  // Continue with next attribute if expression isn't defined
                  if ($att_expr === '') { continue; }

                  // A "plaintext" search compares against the node's text.
                  // todo "plaintext" is not a valid CSS selector!
                  if ($att_name === 'plaintext') {
                      $nodeKeyValue = $node->text();
                  } else {
                      $nodeKeyValue = $node->attr[$att_name];
                  }

                  if (is_object($debug_object)) {
                      $debug_object->debug_log(2,
                          'testing node: '
                          . $node->tag
                          . ' for attribute: '
                          . $att_name
                          . $att_expr
                          . $att_val
                          . ' where nodes value is: '
                          . $nodeKeyValue
                      );
                  }

                  // If lowercase is set, do a case insensitive test of the
                  // value of the selector.
                  if ($lowercase) {
                      $check = $this->match(
                          $att_expr,
                          strtolower($att_val),
                          strtolower($nodeKeyValue),
                          $att_case_sensitivity
                      );
                  } else {
                      $check = $this->match(
                          $att_expr,
                          $att_val,
                          $nodeKeyValue,
                          $att_case_sensitivity
                      );
                  }

                  if (is_object($debug_object)) {
                      $debug_object->debug_log(2,
                          'after match: '
                          . ($check ? 'true' : 'false')
                      );
                  }

                  if (!$check) {
                      $pass = false;
                      break;
                  }
              }
          }

          // Found a match. Add to list and clear node
          if ($pass) { $ret[$node->_[HDOM_INFO_BEGIN]] = 1; }
          unset($node);
      }

      // $ret is passed by reference, so it is the effective return value.
      if (is_object($debug_object)) {
          $debug_object->debug_log(1, 'EXIT - ret: ', $ret);
      }
  }
  /**
   * Test an attribute value against a selector value with a CSS attribute
   * operator.
   *
   * Changes over the previous version:
   * - '|=' accepted any value that merely started with the pattern (so
   *   [lang|=en] matched "english"); it now requires the value to equal the
   *   pattern or to start with the pattern followed by "-";
   * - '~=' split the value on single spaces only, so words separated by
   *   tabs or newlines were missed and an empty pattern could match the
   *   empty "word" between two spaces; the value is now split on any run of
   *   whitespace with empty words discarded.
   *
   * @param string $exp              Operator: =, !=, ^=, $=, *=, |= or ~=.
   * @param string $pattern          Value given in the selector.
   * @param string $value            Value found on the node.
   * @param string $case_sensitivity 'i' lower-cases both sides first.
   * @return bool|int Truthy on a match. The regex-based operators return
   *                  preg_match()'s result; unknown operators return false.
   */
  protected function match($exp, $pattern, $value, $case_sensitivity)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      if ($case_sensitivity === 'i') {
          $pattern = strtolower($pattern);
          $value = strtolower($value);
      }

      switch ($exp) {
          case '=':
              return ($value === $pattern);
          case '!=':
              return ($value !== $pattern);
          case '^=':
              return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
          case '$=':
              return preg_match('/' . preg_quote($pattern, '/') . '$/', $value);
          case '*=':
              return preg_match('/' . preg_quote($pattern, '/') . '/', $value);
          case '|=':
              /**
               * [att|=val]
               *
               * Represents an element with the att attribute, its value
               * either being exactly "val" or beginning with "val"
               * immediately followed by "-" (U+002D).
               */
              return $value === $pattern
                  || strpos($value, $pattern . '-') === 0;
          case '~=':
              /**
               * [att~=val]
               *
               * Represents an element with the att attribute whose value is
               * a whitespace-separated list of words, one of which is
               * exactly "val". If "val" contains whitespace, it will never
               * represent anything (since the words are separated by
               * spaces). Also if "val" is the empty string, it will never
               * represent anything.
               */
              $words = preg_split('/\s+/', trim($value), -1, PREG_SPLIT_NO_EMPTY);
              return in_array($pattern, $words ?: array(), true);
      }

      return false;
  }
  677. protected function parse_selector($selector_string)
  678. {
  679. global $debug_object;
  680. if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
  681. /**
  682. * Pattern of CSS selectors, modified from mootools (https://mootools.net/)
  683. *
  684. * Paperg: Add the colon to the attribute, so that it properly finds
  685. * <tag attr:ibute="something" > like google does.
  686. *
  687. * Note: if you try to look at this attribute, you MUST use getAttribute
  688. * since $dom->x:y will fail the php syntax check.
  689. *
  690. * Notice the \[ starting the attribute? and the @? following? This
  691. * implies that an attribute can begin with an @ sign that is not
  692. * captured. This implies that an html attribute specifier may start
  693. * with an @ sign that is NOT captured by the expression. Farther study
  694. * is required to determine of this should be documented or removed.
  695. *
  696. * Matches selectors in this order:
  697. *
  698. * [0] - full match
  699. *
  700. * [1] - tag name
  701. * ([\w:\*-]*)
  702. * Matches the tag name consisting of zero or more words, colons,
  703. * asterisks and hyphens.
  704. *
  705. * [2] - id name
  706. * (?:\#([\w-]+))
  707. * Optionally matches a id name, consisting of an "#" followed by
  708. * the id name (one or more words and hyphens).
  709. *
  710. * [3] - class names (including dots)
  711. * (?:\.([\w\.-]+))?
  712. * Optionally matches a list of classs, consisting of an "."
  713. * followed by the class name (one or more words and hyphens)
  714. * where multiple classes can be chained (i.e. ".foo.bar.baz")
  715. *
  716. * [4] - attributes
  717. * ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?
  718. * Optionally matches the attributes list
  719. *
  720. * [5] - separator
  721. * ([\/, >+~]+)
  722. * Matches the selector list separator
  723. */
  724. // phpcs:ignore Generic.Files.LineLength
  725. $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";
  726. preg_match_all(
  727. $pattern,
  728. trim($selector_string) . ' ', // Add final ' ' as pseudo separator
  729. $matches,
  730. PREG_SET_ORDER
  731. );
  732. if (is_object($debug_object)) {
  733. $debug_object->debug_log(2, 'Matches Array: ', $matches);
  734. }
  735. $selectors = array();
  736. $result = array();
  737. foreach ($matches as $m) {
  738. $m[0] = trim($m[0]);
  739. // Skip NoOps
  740. if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') { continue; }
  741. // Convert to lowercase
  742. if ($this->dom->lowercase) {
  743. $m[1] = strtolower($m[1]);
  744. }
  745. // Extract classes
  746. if ($m[3] !== '') { $m[3] = explode('.', $m[3]); }
  747. /* Extract attributes (pattern based on the pattern above!)
  748. * [0] - full match
  749. * [1] - attribute name
  750. * [2] - attribute expression
  751. * [3] - attribute value
  752. * [4] - case sensitivity
  753. *
  754. * Note: Attributes can be negated with a "!" prefix to their name
  755. */
  756. if($m[4] !== '') {
  757. preg_match_all(
  758. "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is",
  759. trim($m[4]),
  760. $attributes,
  761. PREG_SET_ORDER
  762. );
  763. // Replace element by array
  764. $m[4] = array();
  765. foreach($attributes as $att) {
  766. // Skip empty matches
  767. if(trim($att[0]) === '') { continue; }
  768. $inverted = (isset($att[1][0]) && $att[1][0] === '!');
  769. $m[4][] = array(
  770. $inverted ? substr($att[1], 1) : $att[1], // Name
  771. (isset($att[2])) ? $att[2] : '', // Expression
  772. (isset($att[3])) ? $att[3] : '', // Value
  773. $inverted, // Inverted Flag
  774. (isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity
  775. );
  776. }
  777. }
  778. // Sanitize Separator
  779. if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator
  780. $m[5] = ' ';
  781. } else { // Other Separator
  782. $m[5] = trim($m[5]);
  783. }
  784. // Clear Separator if it's a Selector List
  785. if ($is_list = ($m[5] === ',')) { $m[5] = ''; }
  786. // Remove full match before adding to results
  787. array_shift($m);
  788. $result[] = $m;
  789. if ($is_list) { // Selector List
  790. $selectors[] = $result;
  791. $result = array();
  792. }
  793. }
  794. if (count($result) > 0) { $selectors[] = $result; }
  795. return $selectors;
  796. }
  797. function __get($name)
  798. {
  799. if (isset($this->attr[$name])) {
  800. return $this->convert_text($this->attr[$name]);
  801. }
  802. switch ($name) {
  803. case 'outertext': return $this->outertext();
  804. case 'innertext': return $this->innertext();
  805. case 'plaintext': return $this->text();
  806. case 'xmltext': return $this->xmltext();
  807. default: return array_key_exists($name, $this->attr);
  808. }
  809. }
  810. function __set($name, $value)
  811. {
  812. global $debug_object;
  813. if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
  814. switch ($name) {
  815. case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
  816. case 'innertext':
  817. if (isset($this->_[HDOM_INFO_TEXT])) {
  818. return $this->_[HDOM_INFO_TEXT] = $value;
  819. }
  820. return $this->_[HDOM_INFO_INNER] = $value;
  821. }
  822. if (!isset($this->attr[$name])) {
  823. $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
  824. $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
  825. }
  826. $this->attr[$name] = $value;
  827. }
  828. function __isset($name)
  829. {
  830. switch ($name) {
  831. case 'outertext': return true;
  832. case 'innertext': return true;
  833. case 'plaintext': return true;
  834. }
  835. //no value attr: nowrap, checked selected...
  836. return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
  837. }
  838. function __unset($name)
  839. {
  840. if (isset($this->attr[$name])) { unset($this->attr[$name]); }
  841. }
  842. function convert_text($text)
  843. {
  844. global $debug_object;
  845. if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
  846. $converted_text = $text;
  847. $sourceCharset = '';
  848. $targetCharset = '';
  849. if ($this->dom) {
  850. $sourceCharset = strtoupper($this->dom->_charset);
  851. $targetCharset = strtoupper($this->dom->_target_charset);
  852. }
  853. if (is_object($debug_object)) {
  854. $debug_object->debug_log(3,
  855. 'source charset: '
  856. . $sourceCharset
  857. . ' target charaset: '
  858. . $targetCharset
  859. );
  860. }
  861. if (!empty($sourceCharset)
  862. && !empty($targetCharset)
  863. && (strcasecmp($sourceCharset, $targetCharset) != 0)) {
  864. // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
  865. if ((strcasecmp($targetCharset, 'UTF-8') == 0)
  866. && ($this->is_utf8($text))) {
  867. $converted_text = $text;
  868. } else {
  869. $converted_text = iconv($sourceCharset, $targetCharset, $text);
  870. }
  871. }
  872. // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
  873. if ($targetCharset === 'UTF-8') {
  874. if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
  875. $converted_text = substr($converted_text, 3);
  876. }
  877. if (substr($converted_text, -3) === "\xef\xbb\xbf") {
  878. $converted_text = substr($converted_text, 0, -3);
  879. }
  880. }
  881. return $converted_text;
  882. }
  883. static function is_utf8($str)
  884. {
  885. $c = 0; $b = 0;
  886. $bits = 0;
  887. $len = strlen($str);
  888. for($i = 0; $i < $len; $i++) {
  889. $c = ord($str[$i]);
  890. if($c > 128) {
  891. if(($c >= 254)) { return false; }
  892. elseif($c >= 252) { $bits = 6; }
  893. elseif($c >= 248) { $bits = 5; }
  894. elseif($c >= 240) { $bits = 4; }
  895. elseif($c >= 224) { $bits = 3; }
  896. elseif($c >= 192) { $bits = 2; }
  897. else { return false; }
  898. if(($i + $bits) > $len) { return false; }
  899. while($bits > 1) {
  900. $i++;
  901. $b = ord($str[$i]);
  902. if($b < 128 || $b > 191) { return false; }
  903. $bits--;
  904. }
  905. }
  906. }
  907. return true;
  908. }
  909. function get_display_size()
  910. {
  911. global $debug_object;
  912. $width = -1;
  913. $height = -1;
  914. if ($this->tag !== 'img') {
  915. return false;
  916. }
  917. // See if there is aheight or width attribute in the tag itself.
  918. if (isset($this->attr['width'])) {
  919. $width = $this->attr['width'];
  920. }
  921. if (isset($this->attr['height'])) {
  922. $height = $this->attr['height'];
  923. }
  924. // Now look for an inline style.
  925. if (isset($this->attr['style'])) {
  926. // Thanks to user gnarf from stackoverflow for this regular expression.
  927. $attributes = array();
  928. preg_match_all(
  929. '/([\w-]+)\s*:\s*([^;]+)\s*;?/',
  930. $this->attr['style'],
  931. $matches,
  932. PREG_SET_ORDER
  933. );
  934. foreach ($matches as $match) {
  935. $attributes[$match[1]] = $match[2];
  936. }
  937. // If there is a width in the style attributes:
  938. if (isset($attributes['width']) && $width == -1) {
  939. // check that the last two characters are px (pixels)
  940. if (strtolower(substr($attributes['width'], -2)) === 'px') {
  941. $proposed_width = substr($attributes['width'], 0, -2);
  942. // Now make sure that it's an integer and not something stupid.
  943. if (filter_var($proposed_width, FILTER_VALIDATE_INT)) {
  944. $width = $proposed_width;
  945. }
  946. }
  947. }
  948. // If there is a width in the style attributes:
  949. if (isset($attributes['height']) && $height == -1) {
  950. // check that the last two characters are px (pixels)
  951. if (strtolower(substr($attributes['height'], -2)) == 'px') {
  952. $proposed_height = substr($attributes['height'], 0, -2);
  953. // Now make sure that it's an integer and not something stupid.
  954. if (filter_var($proposed_height, FILTER_VALIDATE_INT)) {
  955. $height = $proposed_height;
  956. }
  957. }
  958. }
  959. }
  960. // Future enhancement:
  961. // Look in the tag to see if there is a class or id specified that has
  962. // a height or width attribute to it.
  963. // Far future enhancement
  964. // Look at all the parent tags of this image to see if they specify a
  965. // class or id that has an img selector that specifies a height or width
  966. // Note that in this case, the class or id will have the img subselector
  967. // for it to apply to the image.
  968. // ridiculously far future development
  969. // If the class or id is specified in a SEPARATE css file thats not on
  970. // the page, go get it and do what we were just doing for the ones on
  971. // the page.
  972. $result = array(
  973. 'height' => $height,
  974. 'width' => $width
  975. );
  976. return $result;
  977. }
  978. function save($filepath = '')
  979. {
  980. $ret = $this->outertext();
  981. if ($filepath !== '') {
  982. file_put_contents($filepath, $ret, LOCK_EX);
  983. }
  984. return $ret;
  985. }
  986. function addClass($class)
  987. {
  988. if (is_string($class)) {
  989. $class = explode(' ', $class);
  990. }
  991. if (is_array($class)) {
  992. foreach($class as $c) {
  993. if (isset($this->class)) {
  994. if ($this->hasClass($c)) {
  995. continue;
  996. } else {
  997. $this->class .= ' ' . $c;
  998. }
  999. } else {
  1000. $this->class = $c;
  1001. }
  1002. }
  1003. } else {
  1004. if (is_object($debug_object)) {
  1005. $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
  1006. }
  1007. }
  1008. }
  1009. function hasClass($class)
  1010. {
  1011. if (is_string($class)) {
  1012. if (isset($this->class)) {
  1013. return in_array($class, explode(' ', $this->class), true);
  1014. }
  1015. } else {
  1016. if (is_object($debug_object)) {
  1017. $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
  1018. }
  1019. }
  1020. return false;
  1021. }
  1022. function removeClass($class = null)
  1023. {
  1024. if (!isset($this->class)) {
  1025. return;
  1026. }
  1027. if (is_null($class)) {
  1028. $this->removeAttribute('class');
  1029. return;
  1030. }
  1031. if (is_string($class)) {
  1032. $class = explode(' ', $class);
  1033. }
  1034. if (is_array($class)) {
  1035. $class = array_diff(explode(' ', $this->class), $class);
  1036. if (empty($class)) {
  1037. $this->removeAttribute('class');
  1038. } else {
  1039. $this->class = implode(' ', $class);
  1040. }
  1041. }
  1042. }
  1043. function getAllAttributes()
  1044. {
  1045. return $this->attr;
  1046. }
  1047. function getAttribute($name)
  1048. {
  1049. return $this->__get($name);
  1050. }
  1051. function setAttribute($name, $value)
  1052. {
  1053. $this->__set($name, $value);
  1054. }
  1055. function hasAttribute($name)
  1056. {
  1057. return $this->__isset($name);
  1058. }
  1059. function removeAttribute($name)
  1060. {
  1061. $this->__set($name, null);
  1062. }
  1063. function remove()
  1064. {
  1065. if ($this->parent) {
  1066. $this->parent->removeChild($this);
  1067. }
  1068. }
/**
 * Removes a child element, together with its whole subtree, from this node
 * and from the owning DOM's node list.
 *
 * Nothing happens unless $node is found (by identity) in this node's
 * $nodes, in its $children and in the DOM's $nodes.
 *
 * @param object $node Child node to remove.
 * @return void
 */
function removeChild($node)
{
    // Locate the node in all three registries; strict search compares
    // object identity.
    $nidx = array_search($node, $this->nodes, true);
    $cidx = array_search($node, $this->children, true);
    $didx = array_search($node, $this->dom->nodes, true);

    if ($nidx !== false && $cidx !== false && $didx !== false) {

        // Recursively remove the element children first.
        foreach($node->children as $child) {
            $node->removeChild($child);
        }

        // Then drop whatever is still listed in $node->nodes. The recursive
        // call above only handles entries that are also in $children.
        foreach($node->nodes as $entity) {
            $enidx = array_search($entity, $node->nodes, true);
            $edidx = array_search($entity, $node->dom->nodes, true);

            if ($enidx !== false && $edidx !== false) {
                unset($node->nodes[$enidx]);
                unset($node->dom->nodes[$edidx]);
            }
        }

        // Finally unregister the node itself and release its references.
        unset($this->nodes[$nidx]);
        unset($this->children[$cidx]);
        unset($this->dom->nodes[$didx]);

        $node->clear();

    }
}
  1092. function getElementById($id)
  1093. {
  1094. return $this->find("#$id", 0);
  1095. }
  1096. function getElementsById($id, $idx = null)
  1097. {
  1098. return $this->find("#$id", $idx);
  1099. }
  1100. function getElementByTagName($name)
  1101. {
  1102. return $this->find($name, 0);
  1103. }
  1104. function getElementsByTagName($name, $idx = null)
  1105. {
  1106. return $this->find($name, $idx);
  1107. }
  1108. function parentNode()
  1109. {
  1110. return $this->parent();
  1111. }
  1112. function childNodes($idx = -1)
  1113. {
  1114. return $this->children($idx);
  1115. }
  1116. function firstChild()
  1117. {
  1118. return $this->first_child();
  1119. }
  1120. function lastChild()
  1121. {
  1122. return $this->last_child();
  1123. }
  1124. function nextSibling()
  1125. {
  1126. return $this->next_sibling();
  1127. }
  1128. function previousSibling()
  1129. {
  1130. return $this->prev_sibling();
  1131. }
  1132. function hasChildNodes()
  1133. {
  1134. return $this->has_child();
  1135. }
  1136. function nodeName()
  1137. {
  1138. return $this->tag;
  1139. }
  1140. function appendChild($node)
  1141. {
  1142. $node->parent($this);
  1143. return $node;
  1144. }
  1145. }
  1146. class simple_html_dom
  1147. {
// Root node of the parsed document (created in prepare()).
public $root = null;
// Node registry of the document; removeChild() unregisters nodes from it.
public $nodes = array();
// Callback set via set_callback() and cleared via remove_callback().
public $callback = null;
// Whether tag and attribute names are lowercased while parsing.
public $lowercase = false;
// Byte length of the trimmed input as originally loaded.
public $original_size;
// Current byte length of $doc (recomputed after stripping line breaks).
public $size;

// Parser state: byte offset into $doc, the document text, the character at
// $pos, a running counter incremented as nodes are created, and the node
// currently acting as parent for new nodes.
protected $pos;
protected $doc;
protected $char;
protected $cursor;
protected $parent;
// Reset to an empty array on every prepare().
// NOTE(review): presumably filled by remove_noise() - confirm.
protected $noise = array();

// Character sets used by the tokenizer.
protected $token_blank = " \t\r\n"; // whitespace
protected $token_equal = ' =/>';    // terminates an attribute name
protected $token_slash = " />\r\n\t"; // terminates a tag name
protected $token_attr = ' >';

// Charset detected for the document (see parse_charset()) and the charset
// text is converted to on output.
public $_charset = '';
public $_target_charset = '';

// Values taken from the $defaultBRText / $defaultSpanText arguments.
protected $default_br_text = '';
public $default_span_text = '';

// Tag names treated as self-closing (tag name => 1).
protected $self_closing_tags = array(
    'area' => 1,
    'base' => 1,
    'br' => 1,
    'col' => 1,
    'embed' => 1,
    'hr' => 1,
    'img' => 1,
    'input' => 1,
    'link' => 1,
    'meta' => 1,
    'param' => 1,
    'source' => 1,
    'track' => 1,
    'wbr' => 1
);

// Tags whose end tag may close unclosed ancestors (checked in read_tag()).
protected $block_tags = array(
    'body' => 1,
    'div' => 1,
    'form' => 1,
    'root' => 1,
    'span' => 1,
    'table' => 1
);

// For each tag: the open parent tags that are implicitly closed when this
// tag starts (checked in read_tag()).
protected $optional_closing_tags = array(
    // Not optional, see
    // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
    'b' => array('b' => 1),
    'dd' => array('dd' => 1, 'dt' => 1),
    // Not optional, see
    // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
    'dl' => array('dd' => 1, 'dt' => 1),
    'dt' => array('dd' => 1, 'dt' => 1),
    'li' => array('li' => 1),
    'optgroup' => array('optgroup' => 1, 'option' => 1),
    'option' => array('optgroup' => 1, 'option' => 1),
    'p' => array('p' => 1),
    'rp' => array('rp' => 1, 'rt' => 1),
    'rt' => array('rp' => 1, 'rt' => 1),
    'td' => array('td' => 1, 'th' => 1),
    'th' => array('td' => 1, 'th' => 1),
    'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
);
  1211. function __construct(
  1212. $str = null,
  1213. $lowercase = true,
  1214. $forceTagsClosed = true,
  1215. $target_charset = DEFAULT_TARGET_CHARSET,
  1216. $stripRN = true,
  1217. $defaultBRText = DEFAULT_BR_TEXT,
  1218. $defaultSpanText = DEFAULT_SPAN_TEXT,
  1219. $options = 0)
  1220. {
  1221. if ($str) {
  1222. if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
  1223. $this->load_file($str);
  1224. } else {
  1225. $this->load(
  1226. $str,
  1227. $lowercase,
  1228. $stripRN,
  1229. $defaultBRText,
  1230. $defaultSpanText,
  1231. $options
  1232. );
  1233. }
  1234. }
  1235. // Forcing tags to be closed implies that we don't trust the html, but
  1236. // it can lead to parsing errors if we SHOULD trust the html.
  1237. if (!$forceTagsClosed) {
  1238. $this->optional_closing_array = array();
  1239. }
  1240. $this->_target_charset = $target_charset;
  1241. }
  1242. function __destruct()
  1243. {
  1244. $this->clear();
  1245. }
/**
 * Loads markup from a string and parses it into the node tree.
 *
 * Before parsing, several kinds of content are passed to remove_noise()
 * (scripts, CDATA, comments, styles, <code> blocks, server-side code and
 * optionally Smarty tags). The order of these passes is significant.
 *
 * @param string $str             Markup to parse.
 * @param bool   $lowercase       Lowercase tag and attribute names.
 * @param bool   $stripRN         Replace \r and \n with spaces.
 * @param string $defaultBRText   Stored as the default <br> text.
 * @param string $defaultSpanText Stored as the default <span> text.
 * @param int    $options         Bit flags; HDOM_SMARTY_AS_TEXT enables the
 * Smarty pass.
 * @return $this For chaining.
 */
function load(
    $str,
    $lowercase = true,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0)
{
    global $debug_object;

    // Reset parser state and install the new document text.
    $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

    // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
    // Script tag removal now precedes style tag removal.
    // strip out <script> tags (with attributes, then without)
    $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
    $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");

    // strip out the \r \n's if we are told to.
    if ($stripRN) {
        $this->doc = str_replace("\r", ' ', $this->doc);
        $this->doc = str_replace("\n", ' ', $this->doc);

        // set the length of content since we have changed it.
        $this->size = strlen($this->doc);
    }

    // strip out cdata
    // NOTE(review): the meaning of the second argument (true) is defined
    // by remove_noise(), which is not visible here - confirm.
    $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
    // strip out comments
    $this->remove_noise("'<!--(.*?)-->'is");
    // strip out <style> tags (with attributes, then without)
    $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
    $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
    // strip out preformatted tags (only <code> is matched)
    $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
    // strip out server side scripts (<? ... ?>)
    $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);

    if($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts
        $this->remove_noise("'(\{\w)(.*?)(\})'s", true);
    }

    // parsing
    $this->parse();
    // end: record the final cursor position on the root node
    $this->root->_[HDOM_INFO_END] = $this->cursor;
    // detect the document charset now that the tree exists
    $this->parse_charset();

    // make load function chainable
    return $this;
}
  1291. function load_file()
  1292. {
  1293. $args = func_get_args();
  1294. if(($doc = call_user_func_array('file_get_contents', $args)) !== false) {
  1295. $this->load($doc, true);
  1296. } else {
  1297. return false;
  1298. }
  1299. }
  1300. function set_callback($function_name)
  1301. {
  1302. $this->callback = $function_name;
  1303. }
  1304. function remove_callback()
  1305. {
  1306. $this->callback = null;
  1307. }
  1308. function save($filepath = '')
  1309. {
  1310. $ret = $this->root->innertext();
  1311. if ($filepath !== '') { file_put_contents($filepath, $ret, LOCK_EX); }
  1312. return $ret;
  1313. }
  1314. function find($selector, $idx = null, $lowercase = false)
  1315. {
  1316. return $this->root->find($selector, $idx, $lowercase);
  1317. }
/**
 * Releases all nodes and parser buffers held by the DOM.
 *
 * Each registered node is cleared, then the parent and root references are
 * cleared and unset, and finally the document text and noise table are
 * unset.
 *
 * @return void
 */
function clear()
{
    // Clear every node registered with this document.
    if (isset($this->nodes)) {
        foreach ($this->nodes as $n) {
            $n->clear();
            $n = null;
        }
    }

    // This add next line is documented in the sourceforge repository.
    // 2977248 as a fix for ongoing memory leaks that occur even with the
    // use of clear.
    // NOTE(review): no $children property is declared among this class's
    // visible properties, so this branch may never run - confirm.
    if (isset($this->children)) {
        foreach ($this->children as $n) {
            $n->clear();
            $n = null;
        }
    }

    // Drop the reference to the current parent node.
    if (isset($this->parent)) {
        $this->parent->clear();
        unset($this->parent);
    }

    // Drop the reference to the root node.
    if (isset($this->root)) {
        $this->root->clear();
        unset($this->root);
    }

    // Release the document text and the noise table.
    unset($this->doc);
    unset($this->noise);
}
  1346. function dump($show_attr = true)
  1347. {
  1348. $this->root->dump($show_attr);
  1349. }
  1350. protected function prepare(
  1351. $str, $lowercase = true,
  1352. $defaultBRText = DEFAULT_BR_TEXT,
  1353. $defaultSpanText = DEFAULT_SPAN_TEXT)
  1354. {
  1355. $this->clear();
  1356. $this->doc = trim($str);
  1357. $this->size = strlen($this->doc);
  1358. $this->original_size = $this->size; // original size of the html
  1359. $this->pos = 0;
  1360. $this->cursor = 1;
  1361. $this->noise = array();
  1362. $this->nodes = array();
  1363. $this->lowercase = $lowercase;
  1364. $this->default_br_text = $defaultBRText;
  1365. $this->default_span_text = $defaultSpanText;
  1366. $this->root = new simple_html_dom_node($this);
  1367. $this->root->tag = 'root';
  1368. $this->root->_[HDOM_INFO_BEGIN] = -1;
  1369. $this->root->nodetype = HDOM_TYPE_ROOT;
  1370. $this->parent = $this->root;
  1371. if ($this->size > 0) { $this->char = $this->doc[0]; }
  1372. }
  1373. protected function parse()
  1374. {
  1375. while (true) {
  1376. // Read next tag if there is no text between current position and the
  1377. // next opening tag.
  1378. if (($s = $this->copy_until_char('<')) === '') {
  1379. if($this->read_tag()) {
  1380. continue;
  1381. } else {
  1382. return true;
  1383. }
  1384. }
  1385. // Add a text node for text between tags
  1386. $node = new simple_html_dom_node($this);
  1387. ++$this->cursor;
  1388. $node->_[HDOM_INFO_TEXT] = $s;
  1389. $this->link_nodes($node, false);
  1390. }
  1391. }
  1392. protected function parse_charset()
  1393. {
  1394. global $debug_object;
  1395. $charset = null;
  1396. if (function_exists('get_last_retrieve_url_contents_content_type')) {
  1397. $contentTypeHeader = get_last_retrieve_url_contents_content_type();
  1398. $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
  1399. if ($success) {
  1400. $charset = $matches[1];
  1401. if (is_object($debug_object)) {
  1402. $debug_object->debug_log(2,
  1403. 'header content-type found charset of: '
  1404. . $charset
  1405. );
  1406. }
  1407. }
  1408. }
  1409. if (empty($charset)) {
  1410. // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
  1411. $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);
  1412. if (!empty($el)) {
  1413. $fullvalue = $el->content;
  1414. if (is_object($debug_object)) {
  1415. $debug_object->debug_log(2,
  1416. 'meta content-type tag found'
  1417. . $fullvalue
  1418. );
  1419. }
  1420. if (!empty($fullvalue)) {
  1421. $success = preg_match(
  1422. '/charset=(.+)/i',
  1423. $fullvalue,
  1424. $matches
  1425. );
  1426. if ($success) {
  1427. $charset = $matches[1];
  1428. } else {
  1429. // If there is a meta tag, and they don't specify the
  1430. // character set, research says that it's typically
  1431. // ISO-8859-1
  1432. if (is_object($debug_object)) {
  1433. $debug_object->debug_log(2,
  1434. 'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
  1435. );
  1436. }
  1437. $charset = 'ISO-8859-1';
  1438. }
  1439. }
  1440. }
  1441. }
  1442. if (empty($charset)) {
  1443. // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
  1444. if ($meta = $this->root->find('meta[charset]', 0)) {
  1445. $charset = $meta->charset;
  1446. if (is_object($debug_object)) {
  1447. $debug_object->debug_log(2, 'meta charset: ' . $charset);
  1448. }
  1449. }
  1450. }
  1451. if (empty($charset)) {
  1452. // Try to guess the charset based on the content
  1453. // Requires Multibyte String (mbstring) support (optional)
  1454. if (function_exists('mb_detect_encoding')) {
  1455. /**
  1456. * mb_detect_encoding() is not intended to distinguish between
  1457. * charsets, especially single-byte charsets. Its primary
  1458. * purpose is to detect which multibyte encoding is in use,
  1459. * i.e. UTF-8, UTF-16, shift-JIS, etc.
  1460. *
  1461. * -- https://bugs.php.net/bug.php?id=38138
  1462. *
  1463. * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
  1464. * always result in CP1251/ISO-8859-5 and vice versa.
  1465. *
  1466. * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
  1467. * to stay compatible.
  1468. */
  1469. $encoding = mb_detect_encoding(
  1470. $this->doc,
  1471. array( 'UTF-8', 'CP1252', 'ISO-8859-1' )
  1472. );
  1473. if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') {
  1474. // Due to a limitation of mb_detect_encoding
  1475. // 'CP1251'/'ISO-8859-5' will be detected as
  1476. // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
  1477. // which case we can simply assume it is the other charset.
  1478. if (!@iconv('CP1252', 'UTF-8', $this->doc)) {
  1479. $encoding = 'CP1251';
  1480. }
  1481. }
  1482. if ($encoding !== false) {
  1483. $charset = $encoding;
  1484. if (is_object($debug_object)) {
  1485. $debug_object->debug_log(2, 'mb_detect: ' . $charset);
  1486. }
  1487. }
  1488. }
  1489. }
  1490. if (empty($charset)) {
  1491. // Assume it's UTF-8 as it is the most likely charset to be used
  1492. $charset = 'UTF-8';
  1493. if (is_object($debug_object)) {
  1494. $debug_object->debug_log(2, 'No match found, assume ' . $charset);
  1495. }
  1496. }
  1497. // Since CP1252 is a superset, if we get one of it's subsets, we want
  1498. // it instead.
  1499. if ((strtolower($charset) == 'iso-8859-1')
  1500. || (strtolower($charset) == 'latin1')
  1501. || (strtolower($charset) == 'latin-1')) {
  1502. $charset = 'CP1252';
  1503. if (is_object($debug_object)) {
  1504. $debug_object->debug_log(2,
  1505. 'replacing ' . $charset . ' with CP1252 as its a superset'
  1506. );
  1507. }
  1508. }
  1509. if (is_object($debug_object)) {
  1510. $debug_object->debug_log(1, 'EXIT - ' . $charset);
  1511. }
  1512. return $this->_charset = $charset;
  1513. }
  1514. protected function read_tag()
  1515. {
  1516. // Set end position if no further tags found
  1517. if ($this->char !== '<') {
  1518. $this->root->_[HDOM_INFO_END] = $this->cursor;
  1519. return false;
  1520. }
  1521. $begin_tag_pos = $this->pos;
  1522. $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  1523. // end tag
  1524. if ($this->char === '/') {
  1525. $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  1526. // Skip whitespace in end tags (i.e. in "</ html>")
  1527. $this->skip($this->token_blank);
  1528. $tag = $this->copy_until_char('>');
  1529. // Skip attributes in end tags
  1530. if (($pos = strpos($tag, ' ')) !== false) {
  1531. $tag = substr($tag, 0, $pos);
  1532. }
  1533. $parent_lower = strtolower($this->parent->tag);
  1534. $tag_lower = strtolower($tag);
  1535. // The end tag is supposed to close the parent tag. Handle situations
  1536. // when it doesn't
  1537. if ($parent_lower !== $tag_lower) {
  1538. // Parent tag does not have to be closed necessarily (optional closing tag)
  1539. // Current tag is a block tag, so it may close an ancestor
  1540. if (isset($this->optional_closing_tags[$parent_lower])
  1541. && isset($this->block_tags[$tag_lower])) {
  1542. $this->parent->_[HDOM_INFO_END] = 0;
  1543. $org_parent = $this->parent;
  1544. // Traverse ancestors to find a matching opening tag
  1545. // Stop at root node
  1546. while (($this->parent->parent)
  1547. && strtolower($this->parent->tag) !== $tag_lower
  1548. ){
  1549. $this->parent = $this->parent->parent;
  1550. }
  1551. // If we don't have a match add current tag as text node
  1552. if (strtolower($this->parent->tag) !== $tag_lower) {
  1553. $this->parent = $org_parent; // restore origonal parent
  1554. if ($this->parent->parent) {
  1555. $this->parent = $this->parent->parent;
  1556. }
  1557. $this->parent->_[HDOM_INFO_END] = $this->cursor;
  1558. return $this->as_text_node($tag);
  1559. }
  1560. } elseif (($this->parent->parent)
  1561. && isset($this->block_tags[$tag_lower])
  1562. ) {
  1563. // Grandparent exists and current tag is a block tag, so our
  1564. // parent doesn't have an end tag
  1565. $this->parent->_[HDOM_INFO_END] = 0; // No end tag
  1566. $org_parent = $this->parent;
  1567. // Traverse ancestors to find a matching opening tag
  1568. // Stop at root node
  1569. while (($this->parent->parent)
  1570. && strtolower($this->parent->tag) !== $tag_lower
  1571. ) {
  1572. $this->parent = $this->parent->parent;
  1573. }
  1574. // If we don't have a match add current tag as text node
  1575. if (strtolower($this->parent->tag) !== $tag_lower) {
  1576. $this->parent = $org_parent; // restore origonal parent
  1577. $this->parent->_[HDOM_INFO_END] = $this->cursor;
  1578. return $this->as_text_node($tag);
  1579. }
  1580. } elseif (($this->parent->parent)
  1581. && strtolower($this->parent->parent->tag) === $tag_lower
  1582. ) { // Grandparent exists and current tag closes it
  1583. $this->parent->_[HDOM_INFO_END] = 0;
  1584. $this->parent = $this->parent->parent;
  1585. } else { // Random tag, add as text node
  1586. return $this->as_text_node($tag);
  1587. }
  1588. }
  1589. // Set end position of parent tag to current cursor position
  1590. $this->parent->_[HDOM_INFO_END] = $this->cursor;
  1591. if ($this->parent->parent) {
  1592. $this->parent = $this->parent->parent;
  1593. }
  1594. $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  1595. return true;
  1596. }
  1597. // start tag
  1598. $node = new simple_html_dom_node($this);
  1599. $node->_[HDOM_INFO_BEGIN] = $this->cursor;
  1600. ++$this->cursor;
  1601. $tag = $this->copy_until($this->token_slash); // Get tag name
  1602. $node->tag_start = $begin_tag_pos;
  1603. // doctype, cdata & comments...
  1604. // <!DOCTYPE html>
  1605. // <![CDATA[ ... ]]>
  1606. // <!-- Comment -->
  1607. if (isset($tag[0]) && $tag[0] === '!') {
  1608. $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
  1609. if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
  1610. $node->nodetype = HDOM_TYPE_COMMENT;
  1611. $node->tag = 'comment';
  1612. } else { // Could be doctype or CDATA but we don't care
  1613. $node->nodetype = HDOM_TYPE_UNKNOWN;
  1614. $node->tag = 'unknown';
  1615. }
  1616. if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
  1617. $this->link_nodes($node, true);
  1618. $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  1619. return true;
  1620. }
  1621. // The start tag cannot contain another start tag, if so add as text
  1622. // i.e. "<<html>"
  1623. if ($pos = strpos($tag, '<') !== false) {
  1624. $tag = '<' . substr($tag, 0, -1);
  1625. $node->_[HDOM_INFO_TEXT] = $tag;
  1626. $this->link_nodes($node, false);
  1627. $this->char = $this->doc[--$this->pos]; // prev
  1628. return true;
  1629. }
  1630. // Handle invalid tag names (i.e. "<html#doc>")
  1631. if (!preg_match('/^\w[\w:-]*$/', $tag)) {
  1632. $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
  1633. // Next char is the beginning of a new tag, don't touch it.
  1634. if ($this->char === '<') {
  1635. $this->link_nodes($node, false);
  1636. return true;
  1637. }
  1638. // Next char closes current tag, add and be done with it.
  1639. if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
  1640. $this->link_nodes($node, false);
  1641. $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  1642. return true;
  1643. }
  1644. // begin tag, add new node
  1645. $node->nodetype = HDOM_TYPE_ELEMENT;
  1646. $tag_lower = strtolower($tag);
  1647. $node->tag = ($this->lowercase) ? $tag_lower : $tag;
  1648. // handle optional closing tags
  1649. if (isset($this->optional_closing_tags[$tag_lower])) {
  1650. // Traverse ancestors to close all optional closing tags
  1651. while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
  1652. $this->parent->_[HDOM_INFO_END] = 0;
  1653. $this->parent = $this->parent->parent;
  1654. }
  1655. $node->parent = $this->parent;
  1656. }
  1657. $guard = 0; // prevent infinity loop
  1658. // [0] Space between tag and first attribute
  1659. $space = array($this->copy_skip($this->token_blank), '', '');
  1660. // attributes
  1661. do {
  1662. // Everything until the first equal sign should be the attribute name
  1663. $name = $this->copy_until($this->token_equal);
  1664. if ($name === '' && $this->char !== null && $space[0] === '') {
  1665. break;
  1666. }
  1667. if ($guard === $this->pos) { // Escape infinite loop
  1668. $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  1669. continue;
  1670. }
  1671. $guard = $this->pos;
  1672. // handle endless '<'
  1673. // Out of bounds before the tag ended
  1674. if ($this->pos >= $this->size - 1 && $this->char !== '>') {
  1675. $node->nodetype = HDOM_TYPE_TEXT;
  1676. $node->_[HDOM_INFO_END] = 0;
  1677. $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
  1678. $node->tag = 'text';
  1679. $this->link_nodes($node, false);
  1680. return true;
  1681. }
  1682. // handle mismatch '<'
  1683. // Attributes cannot start after opening tag
  1684. if ($this->doc[$this->pos - 1] == '<') {
  1685. $node->nodetype = HDOM_TYPE_TEXT;
  1686. $node->tag = 'text';
  1687. $node->attr = array();
  1688. $node->_[HDOM_INFO_END] = 0;
  1689. $node->_[HDOM_INFO_TEXT] = substr(
  1690. $this->doc,
  1691. $begin_tag_pos,
  1692. $this->pos - $begin_tag_pos - 1
  1693. );
  1694. $this->pos -= 2;
  1695. $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  1696. $this->link_nodes($node, false);
  1697. return true;
  1698. }
  1699. if ($name !== '/' && $name !== '') { // this is a attribute name
  1700. // [1] Whitespace after attribute name
  1701. $space[1] = $this->copy_skip($this->token_blank);
  1702. $name = $this->restore_noise($name); // might be a noisy name
  1703. if ($this->lowercase) { $name = strtolower($name); }
  1704. if ($this->char === '=') { // attribute with value
  1705. $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  1706. $this->parse_attr($node, $name, $space); // get attribute value
  1707. } else {
  1708. //no value attr: nowrap, checked selected...
  1709. $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
  1710. $node->attr[$name] = true;
  1711. if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
  1712. }
  1713. $node->_[HDOM_INFO_SPACE][] = $space;
  1714. // prepare for next attribute
  1715. $space = array(
  1716. $this->copy_skip($this->token_blank),
  1717. '',
  1718. ''
  1719. );
  1720. } else { // no more attributes
  1721. break;
  1722. }
  1723. } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended
  1724. $this->link_nodes($node, true);
  1725. $node->_[HDOM_INFO_ENDSPACE] = $space[0];
  1726. // handle empty tags (i.e. "<div/>")
  1727. if ($this->copy_until_char('>') === '/') {
  1728. $node->_[HDOM_INFO_ENDSPACE] .= '/';
  1729. $node->_[HDOM_INFO_END] = 0;
  1730. } else {
  1731. // reset parent
  1732. if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
  1733. $this->parent = $node;
  1734. }
  1735. }
  1736. $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  1737. // If it's a BR tag, we need to set it's text to the default text.
  1738. // This way when we see it in plaintext, we can generate formatting that the user wants.
  1739. // since a br tag never has sub nodes, this works well.
  1740. if ($node->tag === 'br') {
  1741. $node->_[HDOM_INFO_INNER] = $this->default_br_text;
  1742. }
  1743. return true;
  1744. }
  /**
   * Parse one attribute value at the cursor and store it on the node.
   *
   * Reads an optional run of blanks, then a double-quoted, single-quoted or
   * unquoted value. For quoted values the cursor ends up one position past
   * the closing quote; for unquoted values it stops on the first character
   * contained in $this->token_attr.
   *
   * When the node already carries an attribute called $name, the new value
   * is still consumed from the input but discarded, so the first occurrence
   * wins and the cursor stays in sync with the document.
   *
   * @param object $node  Node being built; attr[$name] and
   *                      _[HDOM_INFO_QUOTE] are appended to.
   * @param string $name  Attribute name.
   * @param array  $space Whitespace slots for this attribute (by reference);
   *                      index 2 receives the blanks between "=" and the
   *                      value.
   * @return void
   */
  protected function parse_attr($node, $name, &$space)
  {
      $is_duplicate = isset($node->attr[$name]);

      // Always step over the blanks between "=" and the value. Skipping this
      // for duplicates left the cursor on the blank, so the value that
      // followed was later picked up as a bogus attribute name.
      $blank = $this->copy_skip($this->token_blank);
      if (!$is_duplicate) {
          $space[2] = $blank;
      }

      if ($this->char === '"' || $this->char === '\'') {
          $quote_char = $this->char;
          $quote_type = ($quote_char === '"') ? HDOM_QUOTE_DOUBLE : HDOM_QUOTE_SINGLE;
          // step over the opening quote
          $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
          $value = $this->copy_until_char($quote_char);
          // step over the closing quote
          $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
      } else {
          $quote_type = HDOM_QUOTE_NO;
          $value = $this->copy_until($this->token_attr);
      }

      $value = $this->restore_noise($value);

      // PaperG: Attributes should not have \r or \n in them, that counts as
      // html whitespace.
      $value = str_replace(array("\r", "\n"), '', $value);

      // PaperG: If this is a "class" selector, get rid of the preceding and
      // trailing space since some people leave it in the multi class case.
      if ($name === 'class') {
          $value = trim($value);
      }

      // Duplicates are parsed (to advance the cursor) but never stored.
      if (!$is_duplicate) {
          $node->_[HDOM_INFO_QUOTE][] = $quote_type;
          $node->attr[$name] = $value;
      }
  }
  /**
   * Attach a node to the parser's current parent.
   *
   * The node's parent pointer is set to $this->parent and the node is
   * appended to that parent's nodes list. When $is_child is truthy it is
   * appended to the parent's children list as well.
   *
   * @param object $node     Node to attach (by reference).
   * @param bool   $is_child Whether the node also goes into children.
   * @return void
   */
  protected function link_nodes(&$node, $is_child)
  {
      $node->parent = $this->parent;
      $this->parent->nodes[] = $node;

      if (!$is_child) {
          return;
      }

      $this->parent->children[] = $node;
  }
  /**
   * Record a closing tag as literal text.
   *
   * Creates a new node whose text is "</$tag>", bumps the cursor counter,
   * links the node under the current parent (in nodes only, not children)
   * and advances the read position by one character.
   *
   * @param string $tag Tag name to wrap in "</" and ">".
   * @return bool Always true.
   */
  protected function as_text_node($tag)
  {
      $node = new simple_html_dom_node($this);
      $this->cursor++;
      $node->_[HDOM_INFO_TEXT] = "</{$tag}>";
      $this->link_nodes($node, false);

      // move to the next character, or null once the document is exhausted
      ++$this->pos;
      if ($this->pos < $this->size) {
          $this->char = $this->doc[$this->pos];
      } else {
          $this->char = null;
      }

      return true;
  }
  /**
   * Advance the cursor over a leading run of the given characters.
   *
   * Afterwards $this->char holds the character at the new position, or
   * null when the position is at or beyond the end of the document.
   *
   * @param string $chars Set of characters to step over.
   * @return void
   */
  protected function skip($chars)
  {
      $run = strspn($this->doc, $chars, $this->pos);
      $this->pos += $run;

      if ($this->pos < $this->size) {
          $this->char = $this->doc[$this->pos];
      } else {
          $this->char = null;
      }
  }
  /**
   * Advance the cursor over a leading run of the given characters and
   * return the characters that were stepped over.
   *
   * $this->char is refreshed to the character at the new position, or null
   * at the end of the document.
   *
   * @param string $chars Set of characters to step over.
   * @return string The skipped run; '' when nothing matched.
   */
  protected function copy_skip($chars)
  {
      $start = $this->pos;
      $length = strspn($this->doc, $chars, $start);

      $this->pos += $length;
      if ($this->pos < $this->size) {
          $this->char = $this->doc[$this->pos];
      } else {
          $this->char = null;
      }

      return ($length === 0) ? '' : substr($this->doc, $start, $length);
  }
  /**
   * Consume characters up to (not including) the first occurrence of any
   * character in $chars and return what was consumed.
   *
   * $this->char is refreshed to the character at the new position, or null
   * at the end of the document.
   *
   * @param string $chars Set of stop characters.
   * @return string The consumed span.
   */
  protected function copy_until($chars)
  {
      $start = $this->pos;
      $length = strcspn($this->doc, $chars, $start);

      $this->pos += $length;
      if ($this->pos < $this->size) {
          $this->char = $this->doc[$this->pos];
      } else {
          $this->char = null;
      }

      return substr($this->doc, $start, $length);
  }
  /**
   * Consume characters up to (not including) the next occurrence of $char
   * and return them.
   *
   * - Cursor already exhausted ($this->char is null): returns '' and
   *   changes nothing.
   * - $char not found: returns the rest of the document, moves the position
   *   to the end and sets $this->char to null.
   * - $char is at the current position: returns '' and changes nothing.
   * - Otherwise the cursor is left on the found occurrence.
   *
   * @param string $char Stop string handed to strpos().
   * @return string The consumed span.
   */
  protected function copy_until_char($char)
  {
      if ($this->char === null) {
          return '';
      }

      $start = $this->pos;
      $hit = strpos($this->doc, $char, $start);

      if ($hit === false) {
          // no terminator: hand back everything that is left
          $rest = substr($this->doc, $start, $this->size - $start);
          $this->char = null;
          $this->pos = $this->size;
          return $rest;
      }

      if ($hit === $start) {
          return '';
      }

      $this->char = $this->doc[$hit];
      $this->pos = $hit;

      return substr($this->doc, $start, $hit - $start);
  }
  /**
   * Cut every match of $pattern out of the document, leaving a placeholder
   * key behind, and remember the removed text in $this->noise.
   *
   * Keys are '___noise___' followed by a space-padded, five-character
   * number (current noise count + 1000). Matches are handled from last to
   * first, so the offsets captured for earlier matches are still valid when
   * their turn comes.
   *
   * Afterwards $this->size is recomputed and, if the document is not empty,
   * $this->char is reset to its first character.
   *
   * @param string $pattern    PCRE pattern.
   * @param bool   $remove_tag True: stash the whole match; false: stash
   *                           only the first capture group.
   * @return void
   */
  protected function remove_noise($pattern, $remove_tag = false)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      $count = preg_match_all(
          $pattern,
          $this->doc,
          $matches,
          PREG_SET_ORDER | PREG_OFFSET_CAPTURE
      );

      // 0 = entire match, 1 = submatch
      $idx = $remove_tag ? 0 : 1;

      for ($i = $count - 1; $i >= 0; $i--) {
          $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

          if (is_object($debug_object)) {
              $debug_object->debug_log(2, 'key is: ' . $key);
          }

          $captured = $matches[$i][$idx][0];
          $offset = $matches[$i][$idx][1];

          $this->noise[$key] = $captured;
          $this->doc = substr_replace($this->doc, $key, $offset, strlen($captured));
      }

      // reset the length of content
      $this->size = strlen($this->doc);

      if ($this->size > 0) {
          $this->char = $this->doc[0];
      }
  }
  /**
   * Put back text that was swapped out for '___noise___' placeholder keys.
   *
   * For every '___noise___' marker in $text:
   * - if at least five more characters follow, marker + those five
   *   characters form the key; a key present in $this->noise is replaced by
   *   the stored text, an unknown key is replaced by
   *   'UNDEFINED NOISE FOR KEY: ' followed by the key;
   * - otherwise the bare marker is replaced by 'NO NUMERIC NOISE KEY'.
   *
   * Restored text is scanned again, so placeholders nested inside a stored
   * noise entry are expanded too.
   *
   * @param string $text Text possibly containing placeholder keys.
   * @return string Text with the placeholders processed.
   */
  function restore_noise($text)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      $marker = '___noise___';
      $marker_len = strlen($marker); // 11
      $key_len = $marker_len + 5;    // marker plus the 5-character number
      $undefined_prefix = 'UNDEFINED NOISE FOR KEY: ';

      // Search offset. It only moves forward, and only past markers this
      // method itself wrote back into $text.
      $offset = 0;

      while (($pos = strpos($text, $marker, $offset)) !== false) {
          if (strlen($text) >= $pos + $key_len) {
              $key = substr($text, $pos, $key_len);

              if (is_object($debug_object)) {
                  $debug_object->debug_log(2, 'located key of: ' . $key);
              }

              if (isset($this->noise[$key])) {
                  // Offset is left alone so the restored text is rescanned
                  // for nested placeholders.
                  // NOTE(review): a noise entry whose stored text contains
                  // its own key would still loop here, as before.
                  $text = substr($text, 0, $pos)
                      . $this->noise[$key]
                      . substr($text, $pos + $key_len);
              } else {
                  // The message repeats the key, which starts with the
                  // marker. Searching from the start again would find that
                  // same marker forever (text such as "___noise___1000x"
                  // hung the parser), so continue right after it.
                  $text = substr($text, 0, $pos)
                      . $undefined_prefix
                      . $key
                      . substr($text, $pos + $key_len);
                  $offset = $pos + strlen($undefined_prefix) + $marker_len;
              }
          } else {
              // There is no valid key being given back to us... We must get
              // rid of the ___noise___ or we will have a problem.
              $text = substr($text, 0, $pos)
                  . 'NO NUMERIC NOISE KEY'
                  . substr($text, $pos + $marker_len);
          }
      }

      return $text;
  }
  /**
   * Find the first stored noise entry that contains $text.
   *
   * @param string $text Substring to look for.
   * @return string|null The matching noise entry, or null when none of the
   *                     entries contains $text.
   */
  function search_noise($text)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      foreach ($this->noise as $candidate) {
          if (strpos($candidate, $text) === false) {
              continue;
          }

          return $candidate;
      }

      return null;
  }
  /**
   * String conversion: yields the root node's innertext().
   *
   * @return string
   */
  function __toString()
  {
      $root = $this->root;

      return $root->innertext();
  }
  /**
   * Magic read access for a handful of virtual properties.
   *
   * - outertext, innertext: the root node's innertext()
   * - plaintext:            the root node's text()
   * - charset:              $this->_charset
   * - target_charset:       $this->_target_charset
   *
   * @param string $name Property being read.
   * @return mixed The value listed above; null for any other name.
   */
  function __get($name)
  {
      switch ($name) {
          case 'outertext':
          case 'innertext':
              // both map to the root's inner text
              return $this->root->innertext();

          case 'plaintext':
              return $this->root->text();

          case 'charset':
              return $this->_charset;

          case 'target_charset':
              return $this->_target_charset;

          default:
              return null;
      }
  }
  /**
   * camelCase accessor: delegates to the root node's childNodes().
   *
   * @param int $idx Passed through unchanged; defaults to -1.
   * @return mixed Whatever the root node's childNodes() returns.
   */
  function childNodes($idx = -1)
  {
      $root = $this->root;

      return $root->childNodes($idx);
  }
  /**
   * camelCase accessor: delegates to the root node's first_child().
   *
   * @return mixed Whatever the root node's first_child() returns.
   */
  function firstChild()
  {
      $root = $this->root;

      return $root->first_child();
  }
  /**
   * camelCase accessor: delegates to the root node's last_child().
   *
   * @return mixed Whatever the root node's last_child() returns.
   */
  function lastChild()
  {
      $root = $this->root;

      return $root->last_child();
  }
  /**
   * Build an element by parsing the markup "<$name>$value</$name>" with
   * str_get_html() and returning firstChild() of the result.
   *
   * The markup is assembled from the arguments as-is; nothing is escaped.
   * Errors raised anywhere in the expression are silenced with "@".
   * NOTE(review): str_get_html() is defined outside this chunk — confirm
   * what it returns for input it refuses to parse.
   *
   * @param string      $name  Tag name.
   * @param string|null $value Inner markup; null yields an empty element.
   * @return mixed Result of firstChild() on the parsed markup.
   */
  function createElement($name, $value = null)
  {
      // Kept as one expression so the "@" still covers the string
      // building, the parse and the firstChild() call.
      return @str_get_html('<' . $name . '>' . $value . '</' . $name . '>')->firstChild();
  }
  /**
   * Build a text node by parsing $value with str_get_html() and returning
   * the last entry of the result's nodes list.
   *
   * Errors raised anywhere in the expression are silenced with "@".
   * NOTE(review): str_get_html() is defined outside this chunk — confirm
   * what it returns for empty or rejected input; end() needs an array.
   *
   * @param string $value Text (parsed as markup) for the node.
   * @return mixed Last element of the parsed document's nodes, as given by
   *               end().
   */
  function createTextNode($value)
  {
      // end() takes its argument by reference; the property fetch on the
      // call result is left inline so that, and the "@" scope, stay as is.
      return @end(str_get_html($value)->nodes);
  }
  /**
   * Look up a single element by id: runs find() with the selector
   * "#" + $id and index 0.
   *
   * @param string $id Element id (inserted into the selector unescaped).
   * @return mixed Whatever find() returns for index 0.
   */
  function getElementById($id)
  {
      $selector = '#' . $id;

      return $this->find($selector, 0);
  }
  /**
   * Look up elements by id: runs find() with the selector "#" + $id and
   * passes $idx through.
   *
   * @param string   $id  Element id (inserted into the selector unescaped).
   * @param int|null $idx Index handed to find(); defaults to null.
   * @return mixed Whatever find() returns.
   */
  function getElementsById($id, $idx = null)
  {
      $selector = '#' . $id;

      return $this->find($selector, $idx);
  }
  /**
   * Look up a single element by tag name: runs find() with $name as the
   * selector and index 0.
   *
   * @param string $name Tag name, used directly as the selector.
   * @return mixed Whatever find() returns for index 0.
   */
  function getElementByTagName($name)
  {
      $selector = $name;

      return $this->find($selector, 0);
  }
  /**
   * Look up elements by tag name: runs find() with $name as the selector
   * and passes $idx through.
   *
   * NOTE(review): the default index here is -1, while getElementsById()
   * defaults to null — confirm against find() that this is intended.
   *
   * @param string $name Tag name, used directly as the selector.
   * @param int    $idx  Index handed to find(); defaults to -1.
   * @return mixed Whatever find() returns.
   */
  function getElementsByTagName($name, $idx = -1)
  {
      $selector = $name;

      return $this->find($selector, $idx);
  }
  /**
   * camelCase alias of load_file().
   *
   * Every argument given to loadFile() is forwarded to load_file() as a
   * separate positional argument, and load_file()'s result is returned.
   *
   * NOTE(review): load_file() is defined outside this chunk — this assumes
   * it takes its inputs positionally (path first); confirm.
   *
   * @return mixed Whatever load_file() returns.
   */
  function loadFile()
  {
      $args = func_get_args();

      // Spread the collected arguments. Handing $args over directly gave
      // load_file() one array parameter instead of the caller's arguments.
      return call_user_func_array(array($this, 'load_file'), $args);
  }
  1972. }