# scrape.py

  1. import urllib3
  2. from lxml import etree
  3. import csv
  4. list_page = 'https://en.wikipedia.org/wiki/The_World%27s_Billionaires'
  5. user_agent = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) ..'}
  6. http = urllib3.PoolManager(10, headers=user_agent)
  7. curr_page = http.request('GET', list_page)
  8. htmlparser = etree.HTMLParser()
  9. tree = etree.HTML(curr_page.data, htmlparser)
  10. table = []
  11. for x in range(2, 21):
  12. for y in range(2, 12):
  13. number = tree.xpath('//*[@id="mw-content-text"]/div/table[%s]/tr[%s]/td[1]/text()' % (str(x), str(y)))
  14. print(number)
  15. num = number[0].strip()
  16. name = tree.xpath('//*[@id="mw-content-text"]/div/table[%s]/tr[%s]/td[2]/span/span/span/a/text()' % (str(x), str(y)))
  17. if name == []:
  18. name = tree.xpath('//*[@id="mw-content-text"]/div/table[%s]/tr[%s]/td[2]/a/text()' % (str(x), str(y)))
  19. if len(name) == 2:
  20. person = name[0] + ' and ' + name[1]
  21. else:
  22. person = name[0]
  23. else:
  24. person = name[0]
  25. money = tree.xpath('//*[@id="mw-content-text"]/div/table[%s]/tr[%s]/td[3]/text()' % (str(x), str(y)))
  26. worth = money[0].strip()
  27. # print(worth)
  28. age = tree.xpath('//*[@id="mw-content-text"]/div/table[%s]/tr[%s]/td[4]/text()' % (str(x), str(y)))
  29. old = age[0]
  30. origin = tree.xpath('//*[@id="mw-content-text"]/div/table[%s]/tr[%s]/td[5]/a/text()' % (str(x), str(y)))
  31. locale = origin[0]
  32. source = tree.xpath('//*[@id="mw-content-text"]/div/table[%s]/tr[%s]/td[6]/a/text()' % (str(x), str(y)))
  33. if source == []:
  34. source = tree.xpath('//*[@id="mw-content-text"]/div/table[%s]/tr[%s]/td[2]/span/span/span/a/text()' % (str(x), str(y)))
  35. money_maker = source[0]
  36. year = tree.xpath('//*[@id="mw-content-text"]/div/h3[%s]/span/text()' % (str(x-1)))[0]
  37. table.append((year, num, person, worth, old, locale, money_maker))
  38. with open('output.csv', 'w') as csvfile:
  39. w = csv.writer(csvfile)
  40. for x in table:
  41. print(list(x))
  42. w.writerow(list(x))