Relevant libraries

  • urllib2 (Python 2 only; use urllib.request on Python 3)
  • scrapy
  • beautifulsoup
  • requests

Quick start

Use requests and BeautifulSoup (bs4):

  • import the libraries
 import requests
 import bs4
# use URL
 url = ''  # placeholder: set this to the page to scrape (e.g. a WordPress blog)

 # Fetch the page. The timeout keeps the call from blocking forever when the
 # server never answers (requests applies no timeout by default).
 response = requests.get(url, timeout=10)
 # Parse the returned HTML with the parser bundled with Python.
 soup = bs4.BeautifulSoup(response.text, 'html.parser')

 # Demo output. prettify is a method and has to be called to get the
 # formatted markup; printing it uncalled only shows the method object.
 # print(x) with a single argument behaves the same on Python 2 and 3.
 print(soup.prettify())
 print(soup.title)
 print(soup.head)
  • find elements by tag name (works for any <xxx> tag), or by id
 # find_all returns every element with the given tag name.
 # print(x) with a single argument behaves the same on Python 2 and 3.
 print(soup.find_all('h1'))
 print(soup.find_all('p'))
 print(soup.find_all('button'))
 print(soup.find_all('title'))
 # find with id= returns the first element whose id attribute matches.
 print(soup.find(id="post-870"))
 # Show the third link only when the page has at least three of them;
 # indexing [2] directly raises IndexError on pages with fewer links.
 links = soup.find_all('a')
 if len(links) > 2:
     print(links[2])
  • find all the articles and print the third one
# Collect every <article> element and show the third one, if present;
# indexing [2] directly raises IndexError on pages with fewer articles.
# print(x) with a single argument behaves the same on Python 2 and 3.
articles = soup.find_all('article')
if len(articles) > 2:
    print(articles[2])