# Date: 2019.5.20
# Author: isenwen
# Description: crawler for gushiwen.org (Chinese classical poetry site)
import requests
import re

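# Browser-like User-Agent header so the request looks like a normal browser visit.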
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}



def parse_page(url):
    """Fetch one listing page, print the poems found on it, and return them."""
    response = requests.get(url, headers=headers).text
    # Each poem sits in a <div class="cont"> with its title in <b>; the two <a> tags
    # inside <p class="source"> are read as dynasty and author, and <div class="contson">
    # holds the poem text.
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', response, re.DOTALL)
    dynasties = re.findall(r'<p\sclass="source">.*?<a.*?>(.*?)</a>', response, re.DOTALL)
    authors = re.findall(r'<p\sclass="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', response, re.DOTALL)
    content_tags = re.findall(r'<div\sclass="contson".*?>(.*?)</div>', response, re.DOTALL)
    contents = []
    for content in content_tags:
        # Strip any remaining inline tags (e.g. <br />) from the poem body.
        x = re.sub(r'<.*?>', "", content)
        contents.append(x.strip())
    poems = []
    for value in zip(titles, dynasties, authors, contents):
        title, dynasty, author, content = value
        poem = {
            "title": title,
            "dynasty": dynasty,
            "author": author,
            "content": content
        }
        poems.append(poem)
    for poem in poems:
        print(poem)
        print('=' * 30)
    return poems


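# Listing pages follow the pattern https://www.gushiwen.org/default_<n>.aspx.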
def main(page):
    for i in range(1, page + 1):
        url = 'https://www.gushiwen.org/default_{}.aspx'.format(i)
        parse_page(url)


if __name__ == '__main__':
    page = int(input('Enter the number of pages to scrape: '))
    main(page)