iOS解析HTML

长平狐 发布于 2012/08/13 15:44
阅读 883
收藏 0

【开源中国 APP 全新上线】“动弹” 回归、集成大模型对话、畅读技术报告”

xml,json都有大量的库来解析,我们如何解析html呢?

TFHpple是一个小型的封装,可以用来解析html,它是对libxml的封装,语法是xpath。

今天我看到一篇直接用libxml来解析html的文章,参看:http://www.cocoanetics.com/2011/09/taming-html-parsing-with-libxml-1/#comment-3090 那张图画得一目了然,很值得收藏。这篇文章中的源码不能遍历所有的html,我做了一点修改,可以将html遍历并打印出来:

	
// Parses an HTML document with libxml2 and logs every node in document order.
//
// Expected to be in scope:
//   data     - NSData containing the document bytes
//   encoding - the NSStringEncoding of those bytes
//   baseURL  - the document's base URL, i.e. its location

// Translate the NSStringEncoding into an IANA charset name for libxml.
// CFStringGetCStringPtr() is allowed to return NULL even for a valid string,
// so the name is copied into a local buffer instead. When no name can be
// obtained, enc stays NULL and is handed to the parser unchanged.
CFStringEncoding cfenc = CFStringConvertNSStringEncodingToEncoding(encoding);
CFStringRef cfencstr = CFStringConvertEncodingToIANACharSetName(cfenc);
char encBuffer[64];
const char *enc = NULL;
if (cfencstr && CFStringGetCString(cfencstr, encBuffer, sizeof(encBuffer), kCFStringEncodingASCII))
{
	enc = encBuffer;
}

// NSData bytes are not NUL-terminated, so use the length-aware
// htmlReadMemory() instead of htmlReadDoc(), which expects a C string.
htmlDocPtr _htmlDocument = htmlReadMemory((const char *)[data bytes],
      (int)[data length],
      [[baseURL absoluteString] UTF8String],
      enc,
      HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);

// The document node is the root of the walk. The document must stay alive
// while it is being traversed, so it is freed only after the loop below.
xmlNodePtr rootNode = (xmlNodePtr)_htmlDocument;
xmlNodePtr currentNode = rootNode;

while (currentNode)
{
	// Log the node itself.
	if (currentNode->type == XML_ELEMENT_NODE)
	{
		NSMutableArray *attrArray = [NSMutableArray array];

		for (xmlAttrPtr attrNode = currentNode->properties; attrNode; attrNode = attrNode->next)
		{
			// An attribute may have no value node; log only its name then.
			xmlNodePtr contents = attrNode->children;

			if (contents && contents->content)
			{
				[attrArray addObject:[NSString stringWithFormat:@"%s='%s'", attrNode->name, contents->content]];
			}
			else
			{
				[attrArray addObject:[NSString stringWithFormat:@"%s", attrNode->name]];
			}
		}

		NSString *attrString = [attrArray componentsJoinedByString:@" "];

		if ([attrString length])
		{
			attrString = [@" " stringByAppendingString:attrString];
		}

		NSLog(@"<%s%@>", currentNode->name, attrString);
	}
	else if (currentNode->type == XML_TEXT_NODE)
	{
		if (currentNode->content)
		{
			NSLog(@"%@", [NSString stringWithCString:(const char *)currentNode->content encoding:NSUTF8StringEncoding]);
		}
	}
	else if (currentNode->type == XML_COMMENT_NODE)
	{
		// The comment text is stored in content, not in name.
		if (currentNode->content)
		{
			NSLog(@"/* %s */", currentNode->content);
		}
	}

	// Depth first: descend into the children when there are any.
	if (currentNode->children)
	{
		currentNode = currentNode->children;
		continue;
	}

	// No children: climb until a node with a following sibling is found,
	// logging the closing tag of every element that is left on the way up.
	while (currentNode != rootNode && currentNode->next == NULL)
	{
		currentNode = currentNode->parent;

		if (currentNode == NULL)
		{
			break;
		}

		if (currentNode->type == XML_ELEMENT_NODE)
		{
			NSLog(@"</%s>", currentNode->name);
		}
	}

	// Back at the root (or detached from it): the whole tree has been visited.
	if (currentNode == NULL || currentNode == rootNode)
	{
		break;
	}

	currentNode = currentNode->next;
}

// Release the parsed document now that the traversal is complete.
if (_htmlDocument)
{
   xmlFreeDoc(_htmlDocument);
}


不过我还是喜欢用TFHpple,因为它很简单,也好用,但是它的功能不是很完善。比如,不能获取children node,我就写了两个方法,一个是获取children node,一个是获取所有的contents。还有,node属性的content所用的key与node自身content的key一样,都是@"nodeContent",正确情况下属性的key应是@"attributeContent"。

所以我写了这个方法,同时修改node属性的content key.

/**
 * Converts a libxml2 node and its subtree into nested Foundation collections.
 *
 * Keys written into the returned dictionary:
 *   @"nodeName"           - the node's name
 *   @"nodeContent"        - text of a text child (written by the child's call)
 *   @"nodeAttributeArray" - array of dictionaries holding @"attributeName"
 *                           and, when present, @"attributeContent"
 *   @"nodeChildArray"     - array of dictionaries for the child nodes
 *
 * A text node whose parent is an element or an attribute does not produce a
 * dictionary of its own: its text is stored in parentResult (@"nodeContent"
 * or whitespace-trimmed @"attributeContent") and nil is returned.
 *
 * @param currentNode  The node to convert. NULL yields nil.
 * @param parentResult The dictionary of the enclosing node or attribute that
 *                     receives text content. May be nil.
 * @return The dictionary for currentNode, or nil when the node was folded
 *         into parentResult.
 */
NSDictionary *DictionaryForNode2(xmlNodePtr currentNode, NSMutableDictionary *parentResult)
{
	if (currentNode == NULL)
	{
		return nil;
	}

	NSMutableDictionary *resultForNode = [NSMutableDictionary dictionary];

	if (currentNode->name)
	{
		NSString *nodeName =
		[NSString stringWithCString:(const char *)currentNode->name encoding:NSUTF8StringEncoding];
		// The conversion returns nil for bytes that are not valid UTF-8;
		// storing nil in a dictionary would raise an exception.
		if (nodeName)
		{
			[resultForNode setObject:nodeName forKey:@"nodeName"];
		}
	}

	if (currentNode->content && currentNode->type == XML_TEXT_NODE && currentNode->parent)
	{
		NSString *textContent =
		[NSString stringWithCString:(const char *)currentNode->content encoding:NSUTF8StringEncoding];
		xmlElementType parentType = currentNode->parent->type;

		if (parentType == XML_ELEMENT_NODE)
		{
			if (textContent)
			{
				[parentResult setObject:textContent forKey:@"nodeContent"];
			}
			return nil;
		}

		if (parentType == XML_ATTRIBUTE_NODE)
		{
			if (textContent)
			{
				NSString *trimmedContent =
				[textContent stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
				[parentResult setObject:trimmedContent forKey:@"attributeContent"];
			}
			return nil;
		}
	}

	// Attributes: one dictionary per attribute, collected into an array.
	NSMutableArray *attributeArray = [NSMutableArray array];
	for (xmlAttr *attribute = currentNode->properties; attribute; attribute = attribute->next)
	{
		NSMutableDictionary *attributeDictionary = [NSMutableDictionary dictionary];

		NSString *attributeName = nil;
		if (attribute->name)
		{
			attributeName =
			[NSString stringWithCString:(const char *)attribute->name encoding:NSUTF8StringEncoding];
		}
		if (attributeName)
		{
			[attributeDictionary setObject:attributeName forKey:@"attributeName"];
		}

		if (attribute->children)
		{
			// A text child stores its value in attributeDictionary itself and
			// returns nil; any other child comes back as a dictionary.
			NSDictionary *childDictionary = DictionaryForNode2(attribute->children, attributeDictionary);
			if (childDictionary)
			{
				[attributeDictionary setObject:childDictionary forKey:@"attributeContent"];
			}
		}

		if ([attributeDictionary count] > 0)
		{
			[attributeArray addObject:attributeDictionary];
		}
	}
	if ([attributeArray count] > 0)
	{
		[resultForNode setObject:attributeArray forKey:@"nodeAttributeArray"];
	}

	// Children: recurse; text children fold into resultForNode and return nil.
	NSMutableArray *childContentArray = [NSMutableArray array];
	for (xmlNodePtr childNode = currentNode->children; childNode; childNode = childNode->next)
	{
		NSDictionary *childDictionary = DictionaryForNode2(childNode, resultForNode);
		if (childDictionary)
		{
			[childContentArray addObject:childDictionary];
		}
	}
	if ([childContentArray count] > 0)
	{
		[resultForNode setObject:childContentArray forKey:@"nodeChildArray"];
	}

	return resultForNode;
}

TFHppleElement.m里加了两个key 常量

// Dictionary key for an attribute's content; matches the @"attributeContent"
// key written by DictionaryForNode2.
NSString * const TFHppleNodeAttributeContentKey  = @"attributeContent";
// Dictionary key for the array of child-node dictionaries; matches the
// @"nodeChildArray" key written by DictionaryForNode2.
NSString * const TFHppleNodeChildArrayKey        = @"nodeChildArray";

并修改获取属性方法为:

/**
 * Returns the node's attributes as a name -> content dictionary.
 *
 * Built from the attribute dictionaries stored under
 * TFHppleNodeAttributeArrayKey. An entry that lacks either a name or a
 * content value is skipped, because storing nil in a dictionary raises an
 * exception.
 *
 * @return A dictionary mapping attribute names to their content; empty when
 *         the node has no usable attributes.
 */
- (NSDictionary *) attributes
{
  NSMutableDictionary * translatedAttributes = [NSMutableDictionary dictionary];
  for (NSDictionary * attributeDict in [node objectForKey:TFHppleNodeAttributeArrayKey]) {
    id attributeName = [attributeDict objectForKey:TFHppleNodeAttributeNameKey];
    id attributeContent = [attributeDict objectForKey:TFHppleNodeAttributeContentKey];
    if (attributeName && attributeContent) {
      [translatedAttributes setObject:attributeContent forKey:attributeName];
    }
  }
  return translatedAttributes;
}

并添加获取children node 方法:

// YES when the node dictionary holds an entry under TFHppleNodeChildArrayKey.
- (BOOL) hasChildren
{
	return [node objectForKey: TFHppleNodeChildArrayKey] != nil;
}

// Returns the object stored under TFHppleNodeChildArrayKey, or nil when
// -hasChildren reports NO.
- (NSArray *) children
{
    return [self hasChildren] ? [node objectForKey: TFHppleNodeChildArrayKey] : nil;
}



最后我还加了一个获取所有content的方法:

// Returns content for the given XPath or CSS expression as a string.
// NOTE(review): only the declaration is visible here; the accompanying text
// describes it as collecting all content - confirm against the implementation.
- (NSString *)contentsAt:(NSString *)xPathOrCss;

请看 源码



参看:http://giles-wang.blogspot.com/2011/08/iphoneansi.html








原文链接: http://blog.csdn.net/favormm/article/details/6794487
加载中
OSCHINA
登录后可查看更多优质内容
返回顶部
顶部