[Python] HTML 파싱 하기
HTML 파싱 하기
<예제1>
from HTMLParser import HTMLParser
# create a subclass and override the handler methods
class MyHTMLParser(HTMLParser):
def handle_starttag(self, tag, attrs):
print "Encountered a start tag:", tag
def handle_endtag(self, tag):
print "Encountered an end tag :", tag
def handle_data(self, data):
print "Encountered some data :", data
# instantiate the parser and fed it some HTML
parser = MyHTMLParser()
parser.feed('<html><head><title>Test</title></head>'
'<body><h1>Parse me!</h1></body></html>')
------------------------------------------------------------
<PRINT>
Encountered a start tag: html
Encountered a start tag: head
Encountered a start tag: title
Encountered some data : Test
Encountered an end tag : title
Encountered an end tag : head
Encountered a start tag: body
Encountered a start tag: h1
Encountered some data : Parse me!
Encountered an end tag : h1
Encountered an end tag : body
Encountered an end tag : html
------------------------------------------------------------
<예제2>
from HTMLParser import HTMLParser
from htmlentitydefs import name2codepoint
class MyHTMLParser(HTMLParser):
def handle_starttag(self, tag, attrs):
print "Start tag:", tag
for attr in attrs:
print " attr:", attr
def handle_endtag(self, tag):
print "End tag :", tag
def handle_data(self, data):
print "Data :", data
def handle_comment(self, data):
print "Comment :", data
def handle_entityref(self, name):
c = unichr(name2codepoint[name])
print "Named ent:", c
def handle_charref(self, name):
if name.startswith('x'):
c = unichr(int(name[1:], 16))
else:
c = unichr(int(name))
print "Num ent :", c
def handle_decl(self, data):
print "Decl :", data
parser = MyHTMLParser()
------------------------------------------------------------
parser.feed('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" ''"http://www.w3.org/TR/html4/strict.dtd">')
<PRINT1>
Decl : DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"
parser.feed('<img src="python-logo.png" alt="The Python logo">')
<PRINT2>
Start tag: img
attr: ('src', 'python-logo.png')
attr: ('alt', 'The Python logo')
parser.feed('<h1>Python</h1>')
<PRINT3>
Start tag: h1
Data : Python
End tag : h1
https://stackless.readthedocs.io/en/2.7-slp/library/htmlparser.html#module-HTMLParser