0%

python编写lyngsat爬虫小结

使用 requests 和 bs4 爬取 lyngsat 网页数据,生成数据库,或导出为 Excel。期间遇到一些问题,记录如下:

class

class Crawler:
    """Skeleton showing how the crawler is organised as a class.

    Args:
        a: First constructor argument, kept on the instance as ``self.a``.
        b: Second constructor argument, kept on the instance as ``self.b``.
    """

    def __init__(self, a, b):
        # Bind the arguments to the instance; assigning to the bare names
        # would only rebind the locals and the values would be lost.
        self.a = a
        self.b = b

    def function1(self):
        """Placeholder for a processing step; does nothing yet."""
        pass

    def function2(self):
        """Placeholder for a processing step; does nothing yet."""
        pass

    def start(self):
        """Placeholder entry point; does nothing yet."""
        pass

if __name__ == "__main__":
    # Build the crawler and run it; ``a`` and ``b`` stand for the real
    # constructor arguments.
    crawler = Crawler(a, b)
    crawler.start()

getopt

import getopt
import sys

try:
    # Short options: -h (flag) and -a/-s/-o (each takes a value);
    # long options: --show=VALUE and --keyword=VALUE.
    options, args = getopt.getopt(sys.argv[1:], "ha:s:o:", ["show=", "keyword="])
except getopt.GetoptError:
    usage()
    # Stop here: ``options`` is unbound when parsing fails, so falling
    # through to the loop below would raise NameError.
    sys.exit(2)

for key, value in options:
    if key == "-h":
        pass
    elif key == "-a":
        area = value
    else:
        # Only -h and -a are handled in this excerpt.
        assert False, "unhandled option"

requests

import requests

try:
    # NOTE: verify=False turns off TLS certificate verification.
    response = requests.get(url, proxies=proxy, verify=False)
except requests.exceptions.RequestException as e:
    # The request failed, so there is no response object to process or close.
    print(e)
else:
    try:
        pass  # process the response here
    finally:
        # Release the connection even if processing raises.
        response.close()

bs4

import bs4

soup = bs4.BeautifulSoup(response.content, 'lxml')
for table in soup.find_all('table', align='center', width=720):
    # The first <tr> of each table is skipped.
    for tr in table.find_all('tr')[1:]:
        # Look the cell up once instead of searching the row twice.
        td = tr.find('td', width=180)
        link = td.find('a') if td is not None else None
        if link is None:
            # No matching cell or no link inside it: nothing to print.
            continue
        print("%s %s" % (td.text, link['href']))
# Destroy the parse tree once it is no longer needed.
soup.decompose()

xlwt

import xlwt

# Create the workbook and a single worksheet named 'sheet'.
# NOTE(review): style_compression=2 presumably lets xlwt reuse identical
# fonts/styles -- confirm against the xlwt documentation.
wb = xlwt.Workbook(encoding='utf-8', style_compression=2)
ws = wb.add_sheet('sheet')

# Column widths are given in units of 1/256 of a character width.
ws.col(0).width=256*8   # about 8 characters wide
ws.col(1).width=256*10  # about 10 characters wide

# Cell fill: solid pattern; the foreground colour is looked up by name
# (``color`` is defined outside this excerpt), the background is black.
pattern = xlwt.Pattern()
pattern.pattern = xlwt.Pattern.SOLID_PATTERN
pattern.pattern_fore_colour = xlwt.Style.colour_map[color]
pattern.pattern_back_colour = xlwt.Style.colour_map['black']

# Text alignment: left horizontally, centred vertically.
alignment = xlwt.Alignment()
alignment.horz = alignment.HORZ_LEFT
alignment.vert = alignment.VERT_CENTER

# Font face.
font = xlwt.Font()
font.name = 'Times New Roman'

# Borders on all four edges.
# NOTE(review): 1 is presumably the thin line style -- confirm against the
# xlwt.Borders constants.
borders = xlwt.Borders()
borders.left = 1
borders.right = 1
borders.top = 1
borders.bottom = 1
borders.bottom_colour=0x3A  # colour index used for the bottom edge only

# Bundle pattern, font, borders and alignment into one reusable cell style.
style = xlwt.XFStyle()
style.pattern=pattern
style.font = font
style.borders = borders
style.alignment = alignment

# Merged header cells on row 1: columns 0-4 hold 'TP', columns 5-9 'Service'.
ws.write_merge(1, 1, 0, 4, 'TP', style)
ws.write_merge(1, 1, 5, 9, 'Service', style)

# Plain cell writes; the first uses the default style, the second ``style``.
# ``row``, ``sys`` and ``sym`` are defined outside this excerpt.
# NOTE(review): ``sys`` here looks like a data value rather than the
# standard ``sys`` module -- confirm.
ws.write(row, 9, sys)
ws.write(row, 10, sym, style)

# Write the workbook to the path held in ``xlsname``.
wb.save(xlsname)

re

import re
# Remove the parenthesised part: the match is greedy, so it runs from the
# first "(" to the last ")" on a line.
name = re.sub(r'\(.*\)', '', td_name.text)
# ``name`` already has the parenthesised part removed, so only the leading
# "@" and the spaces following it still need stripping.
provider_name = re.sub(r'^@ *', '', name)

struct

import struct

# Build the whole image in memory; it is written to disk once at the end.
binary = b''
binary += struct.pack('I', DB_STARTCODE)
# Header: number of satellites, number of transponders, and a third word
# that is always 0.
binary += struct.pack('III', len(satlist), len(tplist), 0)

# ``longitude``, ``lnb``, ``direction``, ``sat_name``, ``tp_pos`` and
# ``tp_num`` are taken from ``sat`` by code outside this excerpt.
for sat in satlist:
    m_nTunerSelect = 0b00
    # Longitude in tenths of a degree, limited to 11 bits.
    m_nLongitude = int(float(longitude)*10)&0x7ff

    if lnb.lower() == "ku":
        m_nLnbType = 0b01
        m_Lnb1 = 0b0011
        m_Lnb2 = 0b0101
        m_nSat22k = 0b10
    elif lnb.lower() == "c":
        m_nLnbType = 0b00
        m_Lnb1 = 0b0000
        m_Lnb2 = 0b0010
        m_nSat22k = 0b00
    else:
        # NOTE(review): execution continues with the LNB fields unset (or
        # left over from the previous satellite) -- decide how to handle.
        print("Error Satellite Lnb Type")

    if direction.lower() == "e":
        m_nLongitudeDirection = 0b0
    elif direction.lower() == "w":
        m_nLongitudeDirection = 0b1
    else:
        # NOTE(review): same fall-through as the LNB branch above.
        print("Error Satellite Direction")

    m_nDiseqc12Pos = 0b00000000
    m_nDiseqc10 = 0b000
    m_nDiseqcMotor = 0b00
    m_nSat12v = 0b00

    m_nTpStartPos = tp_pos
    m_nTpNum = tp_num

    info0 = 0<<30 | m_nTunerSelect
    # info1, from bit 0 upwards: longitude (0), LNB type (11), LNB1 (13),
    # LNB2 (17), DiSEqC 1.2 position (21), 22k (29), direction (31).
    info1 = m_nLongitudeDirection<<31 | m_nSat22k<<29 | m_nDiseqc12Pos<<21 | m_Lnb2<<17 | m_Lnb1<<13 | m_nLnbType<<11 | m_nLongitude
    # info2, from bit 0 upwards: DiSEqC 1.0 (0), 12v (3), TP start
    # position (4), TP count (17), DiSEqC motor (30).
    info2 = m_nDiseqcMotor<<30 | m_nTpNum<<17 | m_nTpStartPos<<4 | m_nSat12v<<3 | m_nDiseqc10

    # One record per satellite: 16-byte name followed by the three words.
    binary += struct.pack('16sIII', sat_name.encode('utf-8'), info0, info1, info2)

# Append the end marker and write once, after every satellite record has
# been added; doing this per iteration would write the growing buffer
# again for each satellite.
binary += struct.pack('I', DB_ENDCODE)

# The context manager closes the file even if the write fails.
with open(name, "wb") as db_file:
    db_file.write(binary)

Tips

video.append(vpid.replace(u'\xa0', u' ').strip())

audio_codec = (len(info) > 2) and info[2] or ''

if 'psk' in text.lower() or ('-' in text and '/' in text) or ('-' in  text and '?' in text):
    continue 

point = (len(tr.select('td[rowspan]')) > 0) and 1 or 0

length = int(tr.find('td')['rowspan'])

profile

requests 和 bs4 会占用大量内存,可以使用 Python 的性能分析(profile)工具来排查内存泄漏并进行优化。