有这样的javascript解析器吗?

阿辉哥 发布于 2010/04/30 14:37
阅读 1K+
收藏 2

最近在做网页内容解析方面的工作,用了一下 Nutch,发现像水木(newsmth.net)之类的网站解析不了。我分析了一下,水木的版面并不是常见的 HTML,而是用 JavaScript 写成的,例如下面这一段。这样的内容,用 Nutch 自带的 JS 和 HTML parser 都解析不了,不知道有哪个开源的 JS 解析器能做到?

<script type="text/javascript"><!--
  var c = new docWriter('Pondoflotus',148,8316,0,0,279,8345,'/groups/talk.faq/Pondoflotus',1,1);
  c.o(184545,184543,'mushgirl',' ',1271737212,'Re: 安静的荷塘.. ',4,0,0);
  c.o(184546,184543,'w19820108',' ',1271741853,'Re: 安静的荷塘.. ',4,0,0);
  c.o(184547,184543,'LeoAnrus',' ',1271751328,'Re: 安静的荷塘.. ',27,0,0);
  c.o(184548,184543,'Kun',' ',1271765071,'Re: 安静的荷塘.. ',13,0,0);
  c.o(184549,184388,'smileamei','b ',1271768614,'[米拉与白马]对的味道 ',1332,0,0);
  c.o(184550,184388,'smileamei','b ',1271770002,'[米拉与白马]祝福 ',1006,0,0);
  c.o(184551,184388,'odcat',' ',1271771124,'Re: [米拉与白马]对的味道 ',8,0,0);
  c.o(184552,184552,'tjttb','b ',1271811818,'不求甚解 ',552,0,0);
  c.o(184553,184543,'LeoAnrus',' ',1271816130,'Re: 安静的荷塘.. ',27,0,0);
  c.o(184554,184552,'seraphfei',' ',1271822235,'Re: 不求甚解 ',36,0,0);
  c.o(184555,184552,'WaterMonster',' ',1272023266,'Re: 不求甚解 ',46,0,0);
  c.o(184556,184388,'guomister',' ',1272033686,'Re: [米拉与白马]祝福 ',26,0,0);
  c.o(184557,184557,'xixiaowul',' ',1272043581,'hi~荷塘 ',13,0,0);
  c.o(184558,184557,'xixiaowul',' ',1272045660,'Re: hi~荷塘 ',12,0,0);
  c.o(184559,184552,'tjttb',' ',1272122470,'Re: 不求甚解 ',44,0,0);
  c.o(184560,184557,'tjttb',' ',1272122502,'Re: hi~荷塘 ',11,0,0);
  c.o(184561,184557,'tjttb',' ',1272122526,'Re: hi~荷塘 ',6,0,0);
  c.o(184562,184562,'tjttb','g ',1272124565,'拿个罐子接着 ',919,0,0);
  c.o(184563,184557,'tjttb',' ',1272141287,'Re: hi~荷塘 ',12,0,0);
  c.o(184564,184562,'odcat',' ',1272157891,'Re: 拿个罐子接着 ',10,0,0);
  c.o(184565,184530,'skitee',' ',1272267183,'Re: 大家好~~.. ',6,0,0);
  c.o(184567,184388,'skitee',' ',1272269016,'Re: [米拉与白马]对的味道 ',317,0,0);
  c.o(184568,184562,'skitee',' ',1272269262,'Re: 拿个罐子接着 ',11,0,0);
  c.o(184569,184388,'smileamei',' ',1272376982,'Re: [米拉与白马]祝福 ',52,0,0);
  c.o(184570,184388,'smileamei',' ',1272377017,'Re: [米拉与白马]对的味道 ',20,0,0);
  c.o(184571,184388,'smileamei',' ',1272377036,'Re: [米拉与白马]对的味道 ',11,0,0);
  c.o(184573,184573,'tjttb',' ',1272513594,'吧唧嘴 ',695,0,0);
  c.o(17340,17340,'elsa','d ',1066531257,'本版治版方针 ',986,0,0);
  c.o(182692,182692,'longfeiyang','d ',1172074323,'=======荷塘月色版 FAQ======= ',3270,0,0);
  c.o(0,0,'SYSOP','d ',1270878471,'请版面尽快产生一名或多名版主 ',179,0,0);
  c.t();c.f('',['Memory','Girl','Single','Heartsong','LoveManage','Boy','Heart'],0);
  //-->
  </script>
加载中
0
悟庭
悟庭

这种时候就应该用 DOM。无论网站是 HTML 写成的,还是 JS 动态生成的,DOM 都能处理,因为 DOM 本来就是动态的。

0
hikari
hikari

上文怎么看都像一个 CSV,这种特例的话,自己写个解析就可以了吧。

较好的开源 JavaScript 解析器当然是 Gecko 和 WebKit 里的(分别是 SpiderMonkey 和 JavaScriptCore)。

返回顶部
顶部