lyngsat crawler

Mainly uses the requests and bs4 packages.


import requests
import bs4

# Local proxy that all HTTP(S) requests are routed through.
# NOTE(review): the "https" entry uses an "https://" scheme; requests
# normally expects "http://" even for the HTTPS proxy — confirm the
# local proxy actually speaks TLS on 8787.
proxy = {"http":"http://127.0.0.1:8787","https":"https://127.0.0.1:8787"}

# Index page listing the satellites for the Asia region; get_urls() reads this.
url = 'http://www.lyngsat.com/asia.html'

#results = [[data.get_text() for data in row.find_all('td')] for row in allRows]

def get_info_from_table(table):
    """Print the transponder/channel data found in one satellite table.

    Iterates the table's rows, skipping the first two (header) rows and
    the final (footer) row.  A <td> carrying a ``rowspan`` attribute
    marks the start of a group: the following ``rowspan - 1`` rows are
    continuation rows that share the same transponder and only print
    their channel-name cell.

    table -- a bs4 Tag for one channel table of a satellite page.
    """
    rowspan_count = 0   # continuation rows remaining in the current rowspan group
    current_tr = 0      # index of the row that opened the current rowspan group
    trs = table.find_all('tr')

    # Rows 0-1 are headers and the last row is a footer; walk the data rows.
    # (range(2, len(trs) - 1) replaces the old range(...)[2:-1] slice.)
    for i in range(2, len(trs) - 1):
        if rowspan_count != 0:
            # Continuation row: print only the channel-name cell.
            td = trs[i].find_all('td')
            # The row directly after the opener still carries one extra
            # leading cell; later continuation rows do not.
            if i == current_tr + 1:
                print("\t" + td[1].getText())
            else:
                print("\t" + td[0].getText())
            rowspan_count -= 1
            continue

        td = trs[i].find_all('td')
        for cell in td:
            if cell.has_attr("rowspan"):
                rowspan_count = int(cell["rowspan"]) - 1
                current_tr = i

        # Header line: frequency cell plus symbol rate, then first channel.
        sym = td[6].getText().split("-")[0]
        print("\n" + td[1].b.getText().strip(' ') + " " + sym)
        print("\t" + td[3].getText())

def get_one_satellite(url):
    """Fetch one satellite page and print every channel table on it.

    Scans each 720px-wide table for its centered title cell
    (colspan=10); when one is found, prints the title followed by the
    full channel listing of that table.

    url -- absolute URL of a lyngsat satellite page.
    """
    print("Start get data from %s" % (url))
    # NOTE(review): verify=False disables TLS certificate checking —
    # acceptable for a scraper behind a local proxy, not for production.
    response = requests.get(url, proxies=proxy, verify=False)
    soup = bs4.BeautifulSoup(response.content, "lxml")

    for table in soup.find_all('table', width=720, cellspacing=0, cellpadding=0):
        # Hoist the row list (was computed twice) and drop the footer row.
        trs = table.find_all('tr')
        for tr in trs[:-1]:
            for td in tr.find_all('td', colspan=10, align="center"):
                print(td.string)
                get_info_from_table(table)
                print("")

    #tables[6].find_all('tr')[2].find_all('td')[1]


def get_urls():
    """Crawl the region index page and process every satellite linked from it.

    Reads the module-level ``url`` (the lyngsat Asia index), locates
    each satellite link inside the 70px-wide cells of the 720px-wide
    index tables, and hands every href to get_one_satellite().
    """
    response = requests.get(url, proxies=proxy, verify=False)
    soup = bs4.BeautifulSoup(response.content, "lxml")

    for table in soup.find_all('table', align="center", width=720):
        for td in table.find_all('td', width=70):
            for link in td.find_all('a'):
                get_one_satellite(link.get('href'))


def main():
    """Entry point: crawl the lyngsat index and print all channel data."""
    get_urls()


if __name__ == "__main__":
    main()
References:
  1. Easily scrape web pages with Python
  2. Installing and basic usage of Python requests
  3. Beautiful Soup Documentation