欢迎您访问程序员文章站!本站旨在为大家提供并分享程序员所需的计算机编程知识。
您现在的位置是: 首页  >  IT编程

使用python解析xml成对应的html示例分享

程序员文章站 2023-04-07 22:22:44
sax将dd.xml解析成html。当然啦,如果得到了xml对应的xsl文件可以直接用libxml2将其转换成html。 复制代码 代码如下:#!/usr/bin/env...

本示例使用 SAX 将 ddt.xml 解析成对应的 HTML(输出为 ddt_sax.html)。当然,如果能拿到该 XML 对应的 XSL 文件,也可以直接用 libxml2 将其转换成 HTML。

复制代码 代码如下:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#---------------------------------------
#   程序:xml解析器
#   版本:01.0
#   作者:mupeng
#   日期:2013-12-18
#   语言:python 2.7
#   功能:将xml解析成对应的html
#   注解:该程序用xml.sax模块的parse函数解析xml,并生成事件
#   继承ContentHandler并重写其事件处理函数
#   dispatcher主要用于相应标签的起始、结束事件的派发
#---------------------------------------
import io
from xml.sax import parse
# The SAX base class is spelled ``ContentHandler``; it is imported under the
# lowercase alias this file uses so existing references keep working.
from xml.sax.handler import ContentHandler as contenthandler

class dispatcher:
    """Mixin that routes SAX element events to per-tag handler methods.

    For an element ``<foo>`` the start event is sent to the first callable
    attribute found among ``startFoo``, ``startfoo``, ``defaultStart`` and
    ``defaultstart``; end events use the ``end`` prefix in the same way.
    Handlers are invoked without arguments.  If no handler exists the event
    is silently ignored.
    """

    def _find_handler(self, *candidates):
        """Return the first callable attribute named in *candidates*, or None."""
        for attr_name in candidates:
            handler = getattr(self, attr_name, None)
            if callable(handler):
                return handler
        return None

    def dispatch(self, prefix, name, attrs=None):
        """Invoke the handler registered for *prefix* + *name*.

        Args:
            prefix: Event kind, ``'start'`` or ``'end'``.
            name: Element name as reported by the parser.
            attrs: Element attributes; accepted for interface compatibility
                but not forwarded, because handlers take no arguments.
        """
        tag = name.capitalize()
        handler = self._find_handler(
            prefix + tag,                      # e.g. startTitle
            prefix + tag.lower(),              # e.g. starttitle
            'default' + prefix.capitalize(),   # e.g. defaultStart
            'default' + prefix,                # e.g. defaultstart
        )
        if handler is not None:
            handler()

    def startElement(self, name, attrs):
        """SAX callback: an element opened."""
        self.dispatch('start', name, attrs)

    def endElement(self, name):
        """SAX callback: an element closed."""
        self.dispatch('end', name)

    # Lowercase spellings kept so existing callers of the old names still work.
    startelement = startElement
    endelement = endElement

class website(dispatcher, contenthandler):
    """SAX handler that renders an RSS channel into ``ddt_sax.html``.

    Element events arrive through ``dispatcher`` and are handled by the
    ``start<tag>`` / ``end<tag>`` methods below.  Character data is collected
    in ``self.temp`` and consumed by the matching ``end<tag>`` method.

    The HTML is emitted incrementally, so the layout depends on element
    order.  NOTE(review): the markup only nests correctly when the channel
    provides title, image (title/link/url), description, link and then the
    items -- confirm against the feed being converted.

    The output file is written as GB2312 text; characters that GB2312 cannot
    represent are emitted as numeric character references.
    """

    def __init__(self):
        """Open the output file and reset the parsing state."""
        contenthandler.__init__(self)
        # One text stream with one encoding, instead of mixing pre-encoded
        # byte strings with raw text in the same file.
        self.fout = io.open('ddt_sax.html', 'w', encoding='gb2312',
                            errors='xmlcharrefreplace')
        self.imagein = False    # True while inside <image>
        self.desflag = False    # not read anywhere in this class
        self.item = False       # True while inside <item>
        self.title = ''
        self.link = ''
        self.guid = ''
        self.url = ''
        self.pubdate = ''
        self.description = ''
        self.temp = ''          # character data of the element being read
        self.prx = ''           # not read anywhere in this class

    def _write(self, text):
        """Write *text* to the output, decoding byte strings as UTF-8 first.

        On Python 2 the literals in this file are byte strings, while the
        text stream only accepts unicode.
        """
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        self.fout.write(text)

    def _take_text(self):
        """Return the buffered character data and empty the buffer."""
        text = self.temp
        self.temp = ''
        return text

    def endDocument(self):
        """Close the output file if the document ended without </channel>."""
        if not self.fout.closed:
            self.fout.close()

    def startchannel(self):
        """Open the HTML document and the page <title>."""
        self._write('''<html>\n<head>\n<title> rss-''')

    def endchannel(self):
        """Close the item table, emit the date script and close the file."""
        self._write('''
                    <tr><td height="20"></td></tr>
                    </table>
                    </center>
                    <script>
    function  gettimediff(str)
    {
     if(str == '')
     {
      return '';
     }

     var pubdate = new Date(str);
     var nowdate = new Date();
     var diffmilseconds = nowdate.valueOf()-pubdate.valueOf();
     var days = diffmilseconds/86400000;
     days = parseInt(days);

     diffmilseconds = diffmilseconds-(days*86400000);
     var hours = diffmilseconds/3600000;
     hours = parseInt(hours);

     diffmilseconds = diffmilseconds-(hours*3600000);
     var minutes = diffmilseconds/60000;
     minutes = parseInt(minutes);

     diffmilseconds = diffmilseconds-(minutes*60000);
     var seconds = diffmilseconds/1000;
     seconds = parseInt(seconds);

     var returnstr = "北京发布时间:" + pubdate.toLocaleString();

     if(days > 0)
     {
      returnstr = returnstr + " (距离现在" + days + "天" + hours + "小时" + minutes + "分钟)";
     }
     else if (hours > 0)
     {
      returnstr = returnstr + " (距离现在" + hours + "小时" + minutes + "分钟)";
     }
     else if (minutes > 0)
     {
      returnstr = returnstr + " (距离现在" + minutes + "分钟)";
     }

     return returnstr;

    }

    function getspantext()
    {
     var pubdate;
     var pubdatearray;
     var spanarray = document.getElementsByTagName("span");

     for(var i = 0; i < spanarray.length; i++)
     {
      pubdate = spanarray[i].innerHTML;
      document.getElementsByTagName("span")[i].innerHTML = gettimediff(pubdate);
     }
    }

    getspantext();
   </script>
                </body>
                </html>
                ''')
        self.fout.close()

    def characters(self, chars):
        """Buffer character data, skipping whitespace-only chunks."""
        if chars.strip():
            self.temp += chars

    def starttitle(self):
        """Open the title row of an item; channel/image titles need nothing."""
        if self.item:
            self._write('''
                        <tr bgcolor="#eeeeee">\n<td style="padding-top:5px;padding-left:5px;" height="30">\n<b>
                    ''')

    def endtitle(self):
        """Emit an item or channel title; an image title is discarded."""
        # Always consume the buffer: an image <title> must not leak into the
        # <link> text that follows it.
        text = self._take_text()

        if self.item:
            self.title = text
            self._write(text)
            self._write('''
                        </b>
                        </td>
                        </tr>
                        <tr bgcolor="#eeeeee">
                        <td style="padding-left:5px;">
                        ''')
        elif not self.imagein:
            # Channel title: finish <head> and open the page header table.
            self.title = text
            self._write(text)
            self._write('''
                </title>\n</head>\n<body>\n<center>\n
                <script>\n

                        function copylink()
                        {
                                clipboardData.setData("text",window.location.href);
                                alert("RSS链接已经复制到剪贴板");
                        }

                        function subscibelink()
                        {
                                var str = window.location.pathname;
                                while(str.match(/^\\//))
                                {
                                        str = str.replace(/^\\//,"");
                                }
                                window.open("http://rss.sina.com.cn/my_sina_web_rss_news.html?url=" + str,"_self");

                        }
                        </script>\n
                <table width="750" cellpadding="0" cellspacing="0">\n
                <tr>\n
                <td align="right" style="padding-right:15px;" valign="bottom">\n
            ''')

    def startimage(self):
        """Mark that the parser is inside <image>."""
        self.imagein = True

    def endimage(self):
        """Mark that the parser has left <image>."""
        self.imagein = False

    def startlink(self):
        """Open the anchor that wraps the channel logo."""
        if self.imagein:
            self._write('''<a href=" ''')

    def endlink(self):
        """Store the link; emit it for the image and for the channel header.

        An item link is only stored here and written later by ``enditem``.
        """
        self.link = self._take_text()
        if self.imagein:
            self._write(self.link)
            self._write('''" target="_blank">\n ''')
        elif not self.item:
            # Channel link: completes the anchor opened at the end of the
            # image block, then closes the header table and opens the item
            # table.
            self._write(self.link)
            self._write(''' " target="_blank"> ''')
            self._write(self.title)
            self._write(''' </a></b></td>
                            </tr>
                            <tr><td colspan="2" align="center">
                            ''')
            self._write(self.description)
            self._write('''
                        </td></tr>
                        <tr style="font-size:12px;" bgcolor="#eeeeff"><td colspan="2" style="font-size:14px;padding-top:5px;padding-bottom:5px;"><b><a href="javascript:copylink();">复制此页链接</a>                <a href="javascript:subscibelink();">我要嵌入该新闻列表到我的页面(简单、快速、实时、免费)</a></b></td></tr>
                        </table>
                        <table width="750" cellpadding="0" cellspacing="0">
                            ''')

    def starturl(self):
        """Open the logo <img> tag."""
        if self.imagein:
            self._write('''<img src=" ''')

    def endurl(self):
        """Store the URL; inside <image>, finish the logo cell."""
        self.url = self._take_text()
        if self.imagein:
            self._write(self.url)
            # Leaves an anchor open; the channel link completes it.
            self._write('''" border="0">\n
                            </a>
                            </td>
                            <td align="left" valign="bottom" style="padding-bottom:8px;"><b><a href="
                            ''')

    def defaultstart(self):
        """Fallback for start tags without a dedicated handler: do nothing."""
        pass

    def defaultend(self):
        """Fallback for end tags without a dedicated handler: drop the text."""
        self.temp = ''

    def startdescription(self):
        """Nothing to emit when a description opens."""
        pass

    def enddescription(self):
        """Store the description; an item description is written at once."""
        self.description = self._take_text()
        if self.item:
            self._write(self.description)

    def endguid(self):
        """Store the guid and clear the buffer so it cannot leak onward."""
        self.guid = self._take_text()

    def endpubdate(self):
        """Store the publication date, ignoring text that looks like a URL."""
        text = self._take_text()
        self.pubdate = '' if text.startswith('http') else text

    def startitem(self):
        """Mark that the parser is inside <item>."""
        self.item = True

    def enditem(self):
        """Finish the item: emit its link row and its publication-date row."""
        self.item = False
        self._write('''
                            </td>
                            </tr>
                            <tr bgcolor="#eeeeee">
                            <td style="padding-top:5px;padding-left:5px;">
                            <a href="''')
        self._write(self.link)
        self._write(''' " target="_blank"> ''')
        self._write(self.guid)
        # The <span> text is rewritten in the browser by getspantext().
        self._write('''
                        </a>
                        </td>
                        </tr>
                        <tr bgcolor="#eeeeee">
                        <td style="padding-top:5px;padding-left:5px;padding-bottom:5px;"><span>''')
        self._write(self.pubdate)
        self._write('''</span></td>
                        </tr>
                        <tr height="10"><td></td></tr>''')

# Program entry point: parse the feed and let the handler write the HTML.
if __name__ == '__main__':
    feed_handler = website()
    parse('ddt.xml', feed_handler)