webmagic数抓取问题

KOP_EVENSON 发布于 2015/04/30 08:52
阅读 1K+
收藏 0

@黄亿华 你好,想跟你请教个问题:在使用webmagic数据抓取过程中,发现数据有漏抓的现像。列表页600条,但在实际抓取的过程中只有抓到200多条。代码下图所示:


@TargetUrl("http://www.tuicool.com/articles/\\w+")
@HelpUrl("http://www.tuicool.com/ah/0/\\w+?lang=1")
public class TuiCoolBlog extends AbstractUpdatableEntity implements AfterExtractor {


    @ExtractBy(value = "//div[@class='span8 contant article_detail_bg']//h1/text()", notNull = true)
    private String article_title;

    @ExtractBy(value = "div.article_body", type = ExtractBy.Type.Css)
    private String article_content;

    @ExtractBy(value=("//span[@class='source']//a[@class='cut cut70']/text()"))
    private String article_link_url;

    @ExtractBy(value = "//span[@class='from']//a[@class='cut cut28 from']/text()")
    private String article_sourse;

    @Formatter(formatter = DateTemplateFormatter.class)
    @ExtractBy("//div[@class='article_meta']//span[@class='timestamp']/text()")
    private Date article_publish_time;

    public String getArticle_title() {
        return article_title;
    }

    public void setArticle_title(String article_title) {
        this.article_title = article_title;
    }

    public String getArticle_content() {
        return article_content;
    }

    public void setArticle_content(String article_content) {
        this.article_content = article_content;
    }

    public String getArticle_link_url() {
        return article_link_url;
    }

    public void setArticle_link_url(String article_link_url) {
        this.article_link_url = article_link_url;
    }

    public String getArticle_sourse() {
        return article_sourse;
    }

    public void setArticle_sourse(String article_sourse) {
        this.article_sourse = article_sourse;
    }

    public Date getArticle_publish_time() {
        return article_publish_time;
    }

    public void setArticle_publish_time(Date article_publish_time) {
        this.article_publish_time = article_publish_time;
    }

    public void afterProcess(Page page) {


    }


}



public class DateTemplateFormatter implements ObjectFormatter<Date> {

    public static final String[] DEFAULT_PATTERN = new String[]{"yyyy-MM-dd HH🇲🇲ss"};
    private String[] datePatterns = DEFAULT_PATTERN;

    @Override
    public Date format(String raw) throws Exception {
        String date = raw.split("时间")[1].substring(1).trim();
        byte[] bytes = date.getBytes();
        for (int i = 0; i < bytes.length; i++) {
            if (bytes[i] == -95) {
                bytes[i] = 32;
            }
        }
        date = new String(bytes);
        Date publishDaate = DateUtils.parseDate(date, datePatterns);
        return publishDaate;
    }

    @Override
    public Class<Date> clazz() {
        return Date.class;
    }

    @Override
    public void initParam(String[] extra) {

    }
}



@Repository("tuiCoolBlogPipeline")
public class TuiCoolBlogPipeline implements PageModelPipeline<TuiCoolBlog> {

    @Resource
    private ArticleDao articleDao;


    @Override
    public void process(TuiCoolBlog tuiCoolBlog, Task task) {
        articleDao.save(tuiCoolBlog);
    }

}



@Controller
@RequestMapping(value = "/api/v1/webMagic")
public class WebMagicController {
    //日志记录
    private static Logger logger = LoggerFactory.getLogger(WebMagicController.class);

    @Qualifier("tuiCoolBlogPipeline")
    @Autowired(required = true)
    private TuiCoolBlogPipeline tuiCoolBlogPipeline;

    @RequestMapping(value = "/run")
    public void run() {
        OOSpider.create(Site.me().setSleepTime(100), tuiCoolBlogPipeline, TuiCoolBlog.class).addUrl("http://www.tuicool.com/ah/0/0?lang=1").thread(10).run();

    }


}





加载中
0
小白农码
小白农码

你好,想请问下,数据漏抓,要怎么处理,

我设置了失败重复抓取也没用,

漏抓的数据较少,但是每次漏掉的数据都不一样,不是网页问题,

返回顶部
顶部