主要用到 requests 和 bs4 这两个包。
import requests
import bs4
# Proxy endpoints passed as ``proxies=`` to the requests.get() calls below.
# NOTE(review): the "https" entry uses an https:// proxy URL, which presumably
# makes the client speak TLS to the proxy itself -- confirm the local proxy
# on port 8787 supports that.
proxy = {
    "http": "http://127.0.0.1:8787",
    "https": "https://127.0.0.1:8787",
}

# Page that get_urls() downloads to discover the links to follow.
url = "http://www.lyngsat.com/asia.html"
#results = [[data.get_text() for data in row.find_all('td')] for row in allRows]
def get_info_from_table(table):
    """Print selected cell texts from the rows of one HTML table.

    The first two rows and the last row of ``table`` are skipped.  Every
    other row is handled in one of two ways:

    * A *leading* row prints a blank line, then the bold text of its second
      cell (surrounding spaces stripped) followed by the part of its
      seventh cell before the first ``-``, and finally its fourth cell on a
      tab-indented line.  If one of its cells carries a ``rowspan``
      attribute, the next ``rowspan - 1`` rows are treated as continuation
      rows.
    * A *continuation* row prints one tab-indented cell: the second cell
      for the row directly after the leading row, the first cell for any
      later one.

    Args:
        table: A BeautifulSoup ``<table>`` tag, or any object exposing the
            same ``find_all`` / ``has_attr`` / ``getText`` interface.

    Raises:
        IndexError: If a row has fewer cells than the positions read above.
        AttributeError: If a leading row's second cell has no ``<b>`` child.
    """
    rows = table.find_all('tr')
    remaining_span = 0  # continuation rows still expected after a leading row
    leading_index = 0   # index of the row that opened the current rowspan group

    # Single-argument print(...) is used throughout because it behaves the
    # same under the Python 2 print statement and the Python 3 function.
    for index, row in enumerate(rows[2:-1], 2):
        cells = row.find_all('td')

        if remaining_span != 0:
            # Only the row right after the leading row is read from its
            # second cell; later continuation rows are read from the first.
            position = 1 if index == leading_index + 1 else 0
            print("\t" + cells[position].getText())
            remaining_span -= 1
            continue

        # A rowspan on any cell announces how many following rows belong to
        # this one; if several cells have it, the last one wins.
        for cell in cells:
            if cell.has_attr("rowspan"):
                remaining_span = int(cell["rowspan"]) - 1
                leading_index = index

        symbol = cells[6].getText().split("-")[0]
        print("\n" + cells[1].b.getText().strip(' ') + " " + symbol)
        print("\t" + cells[3].getText())
def get_one_satellite(url):
    """Download one page and print the contents of its matching tables.

    The page at ``url`` is fetched through the module-level ``proxy``.  For
    every ``<table>`` with ``width=720``, ``cellspacing=0`` and
    ``cellpadding=0`` the function prints the ``.string`` of each
    ``<td colspan=10 align="center">`` found in any row except the last,
    passes the table to ``get_info_from_table`` and then prints an empty
    line.

    Args:
        url: Address of the page to download.
    """
    # Single-argument print(...) behaves the same under the Python 2 print
    # statement and the Python 3 print function.
    print("Start get data from %s" % url)

    # NOTE: verify=False disables TLS certificate verification.
    response = requests.get(url, proxies=proxy, verify=False)
    soup = bs4.BeautifulSoup(response.content, "lxml")

    for table in soup.find_all('table', width=720, cellspacing=0, cellpadding=0):
        # The last row is excluded from the search for the centred cells.
        for row in table.find_all('tr')[:-1]:
            for cell in row.find_all('td', colspan=10, align="center"):
                print(cell.string)
        # NOTE(review): the original indentation was lost; this assumes the
        # table is processed once per matching table rather than once per
        # centred cell -- confirm against the intended output.
        get_info_from_table(table)
        # print("") emits the blank separator line on both Python 2 and 3
        # (a bare ``print`` is a no-op expression on Python 3).
        print("")
def get_urls():
    """Download the index page and process every page it links to.

    The module-level ``url`` is fetched through ``proxy``.  Inside each
    ``<table align="center" width=720>`` the function looks at every
    ``<td width=70>`` cell and calls ``get_one_satellite`` with the
    ``href`` of each link found there.
    """
    # NOTE: verify=False disables TLS certificate verification.
    response = requests.get(url, proxies=proxy, verify=False)
    soup = bs4.BeautifulSoup(response.content, "lxml")

    for table in soup.find_all('table', align="center", width=720):
        for cell in table.find_all('td', width=70):
            # href=True keeps only anchors that actually have a target, so
            # get_one_satellite is never handed None for a bare <a> tag.
            for link in cell.find_all('a', href=True):
                get_one_satellite(link.get('href'))
def main():
    """Script entry point: run the crawl by calling get_urls()."""
    get_urls()


# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()