使用 requests 和 bs4 爬取 lyngsat 网页数据,生成数据库,或导出为 Excel。期间遇到一些问题,记录如下:
class
class Crawler:
    """Skeleton of a crawler object configured with two values.

    The three methods are placeholders that currently do nothing;
    ``start`` is the entry point invoked by the script.
    """

    def __init__(self, a, b):
        """Store the two configuration values on the instance."""
        # Assign to attributes: a bare ``a = a`` only rebinds the local
        # parameter and the value is lost when __init__ returns.
        self.a = a
        self.b = b

    def function1(self):
        """Placeholder step; no behaviour yet."""
        pass

    def function2(self):
        """Placeholder step; no behaviour yet."""
        pass

    def start(self):
        """Entry point of the crawler; no behaviour yet."""
        pass
if __name__ == "__main__":
    # Bind the new instance with ``=``; ``crawler - Crawler(a, b)`` is a
    # subtraction expression and leaves ``crawler`` undefined.
    # NOTE(review): ``a`` and ``b`` are not defined in this snippet --
    # presumably they come from command-line parsing; confirm.
    crawler = Crawler(a, b)
    crawler.start()
getopt
import getopt
import sys

# Parse the command line: short options -h, -a VALUE, -s VALUE, -o VALUE
# and long options --show=VALUE, --keyword=VALUE.
try:
    options, args = getopt.getopt(sys.argv[1:], "ha:s:o:", ["show=", "keyword="])
except getopt.GetoptError:
    usage()
    # Stop here: ``options`` was never bound, so falling through to the
    # loop below would raise NameError.
    sys.exit(2)

for key, value in options:
    if key == "-h":
        pass
    elif key == "-a":
        area = value
    else:
        # NOTE(review): -s, -o, --show and --keyword are accepted by the
        # parser above but have no branch here, so they end up in this arm.
        assert False, "unhandled option"
requests
import requests

# Fetch ``url`` through ``proxy`` and always release the connection.
# ``response`` starts as None so the cleanup below is safe when the
# request itself fails (the original called response.close() on an
# unbound name in that case).
response = None
try:
    # NOTE(review): verify=False disables TLS certificate verification;
    # keep it only if the proxy setup really requires it.
    response = requests.get(url, proxies=proxy, verify=False)
except requests.exceptions.RequestException as e:
    # print(e) with a single argument behaves the same on Python 2 and 3.
    print(e)
finally:
    if response is not None:
        response.close()
bs4
import bs4

# Parse the downloaded page and print "<cell text> <link href>" for every
# data row of each centred, 720-wide table.
soup = bs4.BeautifulSoup(response.content, 'lxml')
for table in soup.find_all('table', align='center', width=720):
    # The first <tr> of each table is skipped by the [1:] slice.
    for tr in table.find_all('tr')[1:]:
        # Look the cell up once instead of twice per row.
        cell = tr.find('td', width=180)
        if cell is None:
            continue  # row has no 180-wide cell; nothing to print
        link = cell.find('a')
        if link is None:
            continue  # cell carries no link
        # %-formatting keeps the output identical on Python 2 and 3.
        print("%s %s" % (cell.text, link['href']))
# Destroy the parse tree explicitly once it is no longer needed.
soup.decompose()
xlwt
# Build an .xls workbook with xlwt: one sheet, two sized columns, a shared
# cell style (fill, alignment, font, borders), two merged header cells and
# two data cells, then save it to ``xlsname``.
# NOTE(review): ``color``, ``row``, ``sys``, ``sym`` and ``xlsname`` are
# defined outside this snippet.
import xlwt
wb = xlwt.Workbook(encoding='utf-8', style_compression=2)
ws = wb.add_sheet('sheet')
ws.col(0).width=256*8 #8 characters width
ws.col(1).width=256*10 #10 characters width
# Solid fill; foreground colour is looked up by name from ``color``.
pattern = xlwt.Pattern()
pattern.pattern = xlwt.Pattern.SOLID_PATTERN
pattern.pattern_fore_colour = xlwt.Style.colour_map[color]
pattern.pattern_back_colour = xlwt.Style.colour_map['black']
# Left-aligned horizontally, centred vertically.
alignment = xlwt.Alignment()
alignment.horz = alignment.HORZ_LEFT
alignment.vert = alignment.VERT_CENTER
font = xlwt.Font()
font.name = 'Times New Roman'
# Border on all four sides.
# NOTE(review): 1 presumably selects the thin line style and 0x3A is a
# palette colour index -- confirm against the xlwt.Borders constants.
borders = xlwt.Borders()
borders.left = 1
borders.right = 1
borders.top = 1
borders.bottom = 1
borders.bottom_colour=0x3A
# Combine the pieces into one reusable cell style.
style = xlwt.XFStyle()
style.pattern=pattern
style.font = font
style.borders = borders
style.alignment = alignment
# Header cells on row 1: columns 0-4 merged as 'TP', columns 5-9 as 'Service'.
ws.write_merge(1, 1, 0, 4, 'TP', style)
ws.write_merge(1, 1, 5, 9, 'Service', style)
# NOTE(review): ``sys`` here is a data value whose name coincides with the
# stdlib module; this cell is written without the shared style.
ws.write(row, 9, sys)
ws.write(row, 10, sym, style)
wb.save(xlsname)
re
import re

# Remove a parenthesised annotation from the cell text. The pattern is
# greedy, so it spans from the first '(' to the last ')' on a line.
name = re.sub(r'\(.*\)', '', td_name.text)
# ``name`` already has the parentheses removed, so stripping them a second
# time is a no-op; only the leading '@' and any following spaces remain
# to be dropped.
provider_name = re.sub(r'^@ *', '', name)
struct
import struct

# Serialise the satellite list into a binary database image:
#   start code | (satellite count, transponder count, 0) |
#   one '16sIII' record per satellite | end code
# and write the result to ``name``.
#
# The pieces are collected in a list and joined once; the original used
# ``binary +=`` on a name that was never initialised.
# NOTE(review): this assumes ``binary`` was meant to start empty -- confirm.
parts = [
    struct.pack('I', DB_STARTCODE),
    struct.pack('III', len(satlist), len(tplist), 0),
]

for sat in satlist:
    # NOTE(review): sat_name, longitude, lnb, direction, tp_pos and tp_num
    # are presumably extracted from ``sat`` by code not shown -- confirm.
    m_nTunerSelect = 0b00
    # Longitude in tenths of a degree, limited to 11 bits.
    m_nLongitude = int(float(longitude) * 10) & 0x7ff

    lnb_kind = lnb.lower()
    if lnb_kind == "ku":
        m_nLnbType = 0b01
        m_Lnb1 = 0b0011
        m_Lnb2 = 0b0101
        m_nSat22k = 0b10
    elif lnb_kind == "c":
        m_nLnbType = 0b00
        m_Lnb1 = 0b0000
        m_Lnb2 = 0b0010
        m_nSat22k = 0b00
    else:
        # Printing and carrying on would pack unset or stale LNB fields.
        raise ValueError("Error Satellite Lnb Type")

    side = direction.lower()
    if side == "e":
        m_nLongitudeDirection = 0b0
    elif side == "w":
        m_nLongitudeDirection = 0b1
    else:
        raise ValueError("Error Satellite Direction")

    m_nDiseqc12Pos = 0b00000000
    m_nDiseqc10 = 0b000
    m_nDiseqcMotor = 0b00
    m_nSat12v = 0b00
    m_nTpStartPos = tp_pos
    m_nTpNum = tp_num

    info0 = (0 << 30) | m_nTunerSelect
    info1 = (
        (m_nLongitudeDirection << 31)
        | (m_nSat22k << 29)
        | (m_nDiseqc12Pos << 21)
        | (m_Lnb2 << 17)
        | (m_Lnb1 << 13)
        | (m_nLnbType << 11)
        | m_nLongitude
    )
    # The original assigned info2 twice; only the second layout (below) ever
    # took effect. The overwritten alternative packed the same fields in
    # the opposite order:
    #   m_nDiseqc10<<29 | m_nSat12v<<28 | m_nTpStartPos<<15 | m_nTpNum<<2 | m_nDiseqcMotor
    info2 = (
        (m_nDiseqcMotor << 30)
        | (m_nTpNum << 17)
        | (m_nTpStartPos << 4)
        | (m_nSat12v << 3)
        | m_nDiseqc10
    )
    # '16s' pads or truncates the encoded name to exactly 16 bytes.
    parts.append(
        struct.pack('16sIII', sat_name.encode('utf-8'), info0, info1, info2)
    )

parts.append(struct.pack('I', DB_ENDCODE))
binary = b''.join(parts)

# Open the output only after the image is complete, and let the context
# manager close it; the original bound the handle to ``file`` (shadowing
# the builtin) and never closed it.
with open(name, "wb") as db_file:
    db_file.write(binary)
Tips
# Replace non-breaking spaces with plain spaces, then trim, before storing.
video.append(vpid.replace(u'\xa0', u' ').strip())
# Third field of ``info`` when it exists and is non-empty, else ''.
# A conditional expression replaces the ``cond and a or b`` idiom with the
# same result for every input.
audio_codec = info[2] if len(info) > 2 and info[2] else ''
# Skip the current item when ``text`` contains 'psk' (case-insensitive), or
# contains '-' together with either '/' or '?'.
# NOTE(review): the bare ``continue`` implies this fragment lives inside a
# loop that is not shown here.
if 'psk' in text.lower() or ('-' in text and '/' in text) or ('-' in text and '?' in text):
continue
# 1 when the row contains at least one <td> with a rowspan attribute, else 0
# (conditional expression instead of the ``cond and 1 or 0`` idiom).
point = 1 if len(tr.select('td[rowspan]')) > 0 else 0
# Rowspan of the first <td> in the row, as an integer.
# NOTE(review): this raises if the first <td> has no rowspan attribute --
# presumably only evaluated for rows where ``point`` is 1; confirm.
length = int(tr.find('td')['rowspan'])
profile
requests 和 bs4 存在大量内存占用,可使用相关的 Python profile 工具来调试内存泄漏并进行优化。