Sysops Notepad

[Python] HTML 파싱 하기 본문

업무/dev

[Python] HTML 파싱 하기

sysops 2019. 3. 11. 17:12
728x90

HTML 파싱 하기


<예제1>

try:
    # Python 3: the module was renamed to html.parser.
    from html.parser import HTMLParser
except ImportError:
    # Python 2: original module name.
    from HTMLParser import HTMLParser


# create a subclass and override the handler methods
class MyHTMLParser(HTMLParser):
    """Print one line for every start tag, end tag and text run fed to the parser.

    Each handler passes a single pre-formatted string to ``print`` so the
    output is the same whether ``print`` is the Python 2 statement or the
    Python 3 function.
    """

    def handle_starttag(self, tag, attrs):
        """Print the name of an opening tag; ``attrs`` is ignored here."""
        print("Encountered a start tag: %s" % (tag,))

    def handle_endtag(self, tag):
        """Print the name of a closing tag."""
        print("Encountered an end tag : %s" % (tag,))

    def handle_data(self, data):
        """Print a run of text found between tags."""
        print("Encountered some data  : %s" % (data,))


# instantiate the parser and feed it some HTML
parser = MyHTMLParser()
parser.feed(
    '<html><head><title>Test</title></head>'
    '<body><h1>Parse me!</h1></body></html>'
)

------------------------------------------------------------


<PRINT>

Encountered a start tag: html

Encountered a start tag: head

Encountered a start tag: title

Encountered some data  : Test

Encountered an end tag : title

Encountered an end tag : head

Encountered a start tag: body

Encountered a start tag: h1

Encountered some data  : Parse me!

Encountered an end tag : h1

Encountered an end tag : body

Encountered an end tag : html




------------------------------------------------------------

<예제2>

try:
    # Python 3 module names.
    from html.parser import HTMLParser
    from html.entities import name2codepoint
except ImportError:
    # Python 2 module names.
    from HTMLParser import HTMLParser
    from htmlentitydefs import name2codepoint

try:
    _unichr = unichr  # Python 2: code point -> unicode character
except NameError:
    _unichr = chr  # Python 3: chr() already returns a text character


class MyHTMLParser(HTMLParser):
    """Print a labelled line for every event the parser reports.

    Each handler passes a single pre-formatted string to ``print`` so the
    output is the same whether ``print`` is the Python 2 statement or the
    Python 3 function.

    NOTE: on Python 3.5+ the base parser converts character references
    itself by default (``convert_charrefs=True``), so ``handle_entityref``
    and ``handle_charref`` are only invoked when the parser is created with
    ``convert_charrefs=False``.
    """

    def handle_starttag(self, tag, attrs):
        """Print the tag name, then each ``(name, value)`` attribute pair."""
        print("Start tag: %s" % (tag,))
        for attr in attrs:
            print("     attr: %s" % (attr,))

    def handle_endtag(self, tag):
        """Print the name of a closing tag."""
        print("End tag  : %s" % (tag,))

    def handle_data(self, data):
        """Print a run of text found between tags."""
        print("Data     : %s" % (data,))

    def handle_comment(self, data):
        """Print the text of a comment."""
        print("Comment  : %s" % (data,))

    def handle_entityref(self, name):
        """Print the character for a named reference such as ``&gt;``.

        An unknown entity name is printed back as written (``&name;``)
        instead of raising ``KeyError`` in the middle of parsing.
        """
        try:
            c = _unichr(name2codepoint[name])
        except KeyError:
            c = "&%s;" % (name,)
        print("Named ent: %s" % (c,))

    def handle_charref(self, name):
        """Print the character for a numeric reference such as ``&#62;`` or ``&#x3E;``.

        ``name`` is the reference without ``&#`` and ``;``. The hexadecimal
        prefix may be ``x`` or ``X``. A code point that cannot be converted
        to a character is printed back as written (``&#name;``).
        """
        try:
            if name.startswith(('x', 'X')):
                c = _unichr(int(name[1:], 16))
            else:
                c = _unichr(int(name))
        except (ValueError, OverflowError):
            # Raised for code points outside the range the platform supports.
            c = "&#%s;" % (name,)
        print("Num ent  : %s" % (c,))

    def handle_decl(self, data):
        """Print the text of a declaration such as a DOCTYPE."""
        print("Decl     : %s" % (data,))


# Create the parser instance used by the feed() calls below.
parser = MyHTMLParser()

------------------------------------------------------------


# Feed a DOCTYPE declaration; it is reported through handle_decl().
parser.feed(
    '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" '
    '"http://www.w3.org/TR/html4/strict.dtd">'
)


<PRINT1>

Decl     : DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"



# Feed an <img> element; handle_starttag() prints the tag name and then each
# (name, value) attribute pair.
parser.feed('<img src="python-logo.png" alt="The Python logo">')


<PRINT2>

Start tag: img

     attr: ('src', 'python-logo.png')

     attr: ('alt', 'The Python logo')



# Feed a complete element; the start tag, its text and the end tag are
# reported in that order.
parser.feed('<h1>Python</h1>')


<PRINT3>

Start tag: h1

Data     : Python

End tag  : h1



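참고 문서 (Python 2.7 HTMLParser): https://stackless.readthedocs.io/en/2.7-slp/library/htmlparser.html#module-HTMLParser
(Python 3에서는 모듈 이름이 HTMLParser → html.parser, htmlentitydefs → html.entities 로 바뀌었습니다.)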



728x90
Comments