heritrix爬取友人网(http://mobile.younet.com/)信息后遇到的问题

陈浩雄 发布于 2010/05/16 20:58
阅读 2K+
收藏 1

最近在使用heritrix爬取了http://mobile.younet.com/网站的网站产品页面后,在运行写入main函数的 Extractor后,控制台并没有出现所想要的信息,只有count输出为0 的信息,我由于初学实在是解决不出来,贴出我用的两个类Extractor和ExtractYounetMobile希望大家能帮我找找是什么原因了

package com.backSearch.extractor;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlparser.Parser;

import com.backSearch.extractor.younet.ExtractYounetMobile;

public abstract class Extractor {

	protected static final String NEWLINE = "\r\n";

	/**
	 * 表示所有结果的输出路径
	 */
	private String outputPath = "";

	/**
	 * 表示当前正在被处理的文件
	 */
	private String inputFilePath;

	/**
	 * 表示当前所有被抓取的网页的镜象根目录 在Heritrix用mirror目录表示
	 */
	private String mirrorDir = "";

	/**
	 * 用于存放被处理过后的产口的图片的目录
	 */
	private String imageDir = "";

	/**
	 * HTMLParser的实例
	 */
	private Parser parser;

	/**
	 * 对图片路径进行哈希的算法,这里采用MD5算法
	 */
	protected static final String HASH_ALGORITHM = "md5";

	/**
	 * 分隔符
	 */
	public static final String SEPARATOR = "======================";

	/**
	 * 装载需要的网页文件
	 * 
	 */
	public void loadFile(String path) {
		try {
			parser = new Parser(path);
			inputFilePath = path;
			parser.setEncoding("UTF-8");
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * 获取输出的路径
	 */
	public String getOutputPath() {
		return outputPath;
	}

	/**
	 * 设置输出的路径,通常在初始化Extractor时就应该做
	 */
	public void setOutputPath(String outputPath) {
		this.outputPath = outputPath;
	}

	public Parser getParser() {
		return parser;
	}

	/**
	 * 使用正则来匹配并获得网页中的字符串
	 */
	protected String getProp(String pattern, String match, int index) {
		Pattern sp = Pattern.compile(pattern);
		Matcher matcher = sp.matcher(match);
		while (matcher.find()) {
			return matcher.group(index);
		}
		return null;
	}

	/**
	 * 抽象方法,用于供子类实现。 其功能主要是解释网页文件 将产品信息保存到
	 * 
	 */
	public abstract void extract();

	/**
	 * 获取正在处理的文件的路径
	 */
	public String getInputFilePath() {
		return inputFilePath;
	}

	/**
	 * 从mirror目录下拷贝文件至所设定的图片目录
	 * 该方法可能需要被改变
	 */
	protected boolean copyImage(String image_url, String new_image_file) {

		String dirs = image_url.substring(7);

		try {
			// instance the File as file_in and file_out
			File file_in = new File(new File(mirrorDir), dirs);
			if (file_in == null || !file_in.exists()) {
				file_in = new File("f:\\sousuo\\noimage.jpg");
			}
			
			File file_out = new File(new File(imageDir), new_image_file);

			FileInputStream in1 = new FileInputStream(file_in);
			FileOutputStream out1 = new FileOutputStream(file_out);

			byte[] bytes = new byte[1024];
			int c;
			while ((c = in1.read(bytes)) != -1)
				out1.write(bytes, 0, c);

			// close
			in1.close();
			out1.close();
			return (true); // if success then return true
		} catch (Exception e) {
			e.printStackTrace();
			return (false); // if fail then return false
		}
	}

	public String getImageDir() {
		return imageDir;
	}

	public void setImageDir(String imageDir) {
		this.imageDir = imageDir;
	}

	public String getMirrorDir() {
		return mirrorDir;
	}

	public void setMirrorDir(String mirrorDir) {
		this.mirrorDir = mirrorDir;
	}

	public void setInputFilePath(String inputFilePath) {
		this.inputFilePath = inputFilePath;
	}

//	public static void main(String[] args) throws Exception {
//
//		Extractor extractor = new Extract163Moblie();
//		extractor.setOutputPath("c:\\product\\test\\mobile\\");
//		extractor.setImageDir("c:\\product\\test\\image\\");
//		extractor.setMirrorDir("F:\\data\\163手机\\mirror\\");
//		
//		traverse(extractor, new File("F:\\data\\163手机\\mirror\\mobile.163.com\\0011\\product\\0011000B\\product"));
//		System.out.println(count);
//
//	}
	static int count = 0;
	
	public static void main(String[] args) throws Exception {

		Extractor extractor = new ExtractYounetMobile();
		extractor.setOutputPath("F:\\product\\mobile\\");
		extractor.setImageDir("F:\\product\\image\\");
		extractor.setMirrorDir("F:\\learn\\Workspaces\\MyEclipse 7.0\\heritrixProject_1\\jobs\\YounetMobile-20100514064948846\\mirror\\");
		
		//try {
			//long s = System.currentTimeMillis();
			traverse(extractor, new File("F:\\learn\\Workspaces\\MyEclipse 7.0\\heritrixProject_1\\jobs\\YounetMobile-20100514064948846\\mirror\\mobile.younet.com\\files\\"));
			//long e = System.currentTimeMillis();
			//System.out.println("1---------------------" + e);
			//System.out.println("2---------------------" + s);
			//System.out.println("用时: " + (e - s) / 1000 + " 秒");
			System.out.println("总数" + count);
	//	} catch (Exception e) {
		//	e.printStackTrace();
		//}

	}
	
	
	public static void traverse(Extractor extractor, File path)
			throws Exception {
		if (path == null) {
			return;
		}

		if (path.isDirectory()) {
			String[] files = path.list();
			for (int i = 0; i < files.length; i++) {
				traverse(extractor, new File(path, files[i]));
			}
		} else {
			if (path.getAbsolutePath().endsWith(".html")
					&& path.getAbsolutePath().indexOf("_") == -1) {
				System.out.println(path);
				count++;
				extractor.loadFile(path.getAbsolutePath());
				extractor.extract();
			}
		}
	}

}

package com.backSearch.extractor.younet;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Date;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;

import com.backSearch.extractor.Extractor;
import com.backSearch.util.StringUtils;


public class ExtractYounetMobile extends Extractor {

	@Override
	public void extract() {
		BufferedWriter bw = null;
		NodeFilter title_filter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "mo_tit"));
		NodeFilter attribute_filter = new AndFilter(new TagNameFilter("p"), new HasChildFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "gn_sp1 blue1"))));
		NodeFilter img_filter = new AndFilter(new TagNameFilter("span"), new HasChildFilter(new TagNameFilter("img")));
		
		//提取标题信息
		try {
			//Parser根据过滤器返回所有满足过滤条件的节点
			// 迭代逐渐查找
    		NodeList nodeList=this.getParser().parse(title_filter);
			NodeIterator it = nodeList.elements();
			StringBuffer title = new StringBuffer();
			while (it.hasMoreNodes()) {
				Node node = (Node) it.nextNode();
				String[] names = node.toPlainTextString().split(" ");
				for(int i = 0; i < names.length; i++)
					title.append(names[i]).append("-");
				title.append(new Date().getTime());
				//创建要生成的文件
				bw = new BufferedWriter(new FileWriter(new File(this.getOutputPath() + title + ".txt")));
				//获取当前提取页的完整URL地址
				int startPos = this.getInputFilePath().indexOf("mirror") + 6;
				String url_seg = this.getInputFilePath().substring(startPos);
				url_seg = url_seg.replaceAll("\\\\", "/");
				String url = "http:/" + url_seg;
				//写入当前提取页的完整URL地址
				bw.write(url + NEWLINE);
				bw.write(names[0] + NEWLINE);
				bw.write(names[1] + NEWLINE);
				
			}
			// 重置Parser
			this.getParser().reset();
			Parser attNameParser = null;
			Parser attValueParser = null;
            //Parser parser=new Parser("http://www.sina.com.cn");
			NodeFilter attributeName_filter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "gn_sp1 blue1"));
    		NodeFilter attributeValue_filter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "gn_sp2"));
            String attName = "";
            String attValue = "";
            // 迭代逐渐查找
    		nodeList=this.getParser().parse(attribute_filter);
			it = nodeList.elements();
			while (it.hasMoreNodes()) {				
				Node node = (Node) it.nextNode();
				attNameParser = new Parser();
				attNameParser.setEncoding("UTF-8");
				attNameParser.setInputHTML(node.toHtml());
				NodeList attNameNodeList = attNameParser.parse(attributeName_filter);
				attName = attNameNodeList.elements().nextNode().toPlainTextString();
				
				attValueParser = new Parser();
				attValueParser.setEncoding("UTF-8");
				attValueParser.setInputHTML(node.toHtml());
				NodeList attValueNodeList = attValueParser.parse(attributeValue_filter);
				attValue = attValueNodeList.elements().nextNode().toPlainTextString();
				bw.write(attName.trim() + attValue.trim());
				bw.newLine();
			}
			// 重置Parser
			this.getParser().reset();
			String imgUrl = "";
			String fileType ="";
			// 迭代逐渐查找
    		nodeList=this.getParser().parse(img_filter);
			it = nodeList.elements();
			while (it.hasMoreNodes()) {				
				Node node = (Node) it.nextNode();
				
				ImageTag imgNode = (ImageTag)node.getChildren().elements().nextNode();
				imgUrl = imgNode.getAttribute("src");				
				fileType = imgUrl.trim().substring(imgUrl
						.lastIndexOf(".") + 1);
				//生成新的图片的文件名
				String new_iamge_file = StringUtils.encodePassword(imgUrl, HASH_ALGORITHM) + "." + fileType;
				//imgUrl = new HtmlPaserFilterTest().replace(new_iamge_file, "+", " ");
				//利用miorr目录下的图片生成的新的图片
				this.copyImage(imgUrl, new_iamge_file);
				bw.write(SEPARATOR + NEWLINE);
				bw.write(new_iamge_file + NEWLINE);
			}
			
    		
        } catch(Exception e) {
            e.printStackTrace();
        } finally {
        	try{
    			if (bw != null)
    				bw.close();
    		}catch(IOException e){
    			e.printStackTrace();
    		}
        }
		
	}
}






我是在heritrix里面写了一个MobileYounetExtractor  正则表达式选定
“http://mobile.younet.com/choose.php?groupid=1,2,3,4,&tradeid=[\\d]+,& ”
来抓取该网站下的各种手机型号的页面和相关图片。
希望大家能给我点儿帮助,支持邮箱及QQ联系。谢谢大家



 

加载中
0
陈浩雄
陈浩雄

果然是路径问题,我把路径F:\learn\Workspaces\MyEclipse 7.0\heritrixProject_1\jobs\下的YounetMobile-20100514064948846移到F盘根目录下,控制台正常的打印出了各手机品牌页面。不过在我前面制定的F:\product xia de mobile 和image下什么都没有。。。明天研究。。。要熄灯了

0
陈浩雄
陈浩雄

这几天研究了下,htmlparser还是吃不通,将ExtractYounetMobile的代码改了一通后,新的问题马上出现了,控制台提示

F:\YounetMobile-20100514064948846\mirror\mobile.younet.com\files\23\23734.html
java.io.FileNotFoundException: F:\product\mobile\
    诺基亚-N97-Mini黄金版
-1274142325918.txt (文件名、目录名或卷标语法不正确。)

有哪位同仁也在研究这东西,麻烦点拨我一下吧

返回顶部
顶部