1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
|
import requests from urllib import parse import execjs import pymysql import re import json import time
# Gateway endpoint that every API request in this script is POSTed to.
url = 'https://api.jk.cn/m.api'

# HTTP headers sent with each POST to `url`.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.25 Mobile Safari/537.36'
    ),
    'Host': 'api.jk.cn',
    'Origin': 'https://www.jk.cn',
    'Referer': 'https://www.jk.cn/galaxy/index.html',
    'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
}

# Form fields for the list/search request. main() adds 'pageSize' and
# 'pageNo', and generateSig() adds '_sig', before each request is sent.
data_list = {
    '_mt': 'skydive.infoAndVideoListSearchForH5',
    'serverVersion': 'V2',
    'position': '15000',
    'keyword': '心脏病',
    '_sm': 'md5',
    '_chl': 'android|WAP',
}

# Form fields for the article-detail request. openArticle() adds 'infoId'
# and '_st', and generateSig() adds '_sig', before each request is sent.
data_detail = {
    '_mt': 'headline.getShareHeadlineDetail',
    '_sm': 'md5',
    '_sv': 'null',
}
def generateSig(e):
    """Compute the request signature for ``e`` and store it under ``'_sig'``.

    The signing input is built from every ``key=value`` pair of ``e``
    (after dropping any previous ``'_sig'``), concatenated without
    separators in sorted key order and followed by the fixed suffix
    ``'jk.pingan.com'``. That string is handed to the JavaScript function
    ``_`` defined in ``./js/hs.js``, and whatever it returns is saved as
    ``e['_sig']``.

    Args:
        e: Dict of request parameters with string keys. Modified in place.

    Returns:
        None. The result is written into ``e``.
    """
    # A signature left over from an earlier request must not be signed again.
    e.pop('_sig', None)
    payload = ''.join(key + '=' + str(e[key]) for key in sorted(e))
    payload = payload + 'jk.pingan.com'

    # The context manager guarantees the script file handle is released.
    with open('./js/hs.js', 'r', encoding='UTF-8') as hs:
        hs_str = hs.read()

    ctx = execjs.compile(hs_str)
    e['_sig'] = ctx.call('_', payload)


def openArticle(id):
    """Fetch one article and insert it into the ``hh_article`` table.

    Posts a signed detail request for ``id``, downloads the article body
    from the CDN path given in the response, extracts at most three image
    URLs from it, and inserts title, author nick, body, a type value and
    the JSON-encoded image list into the local ``health_helper`` database.

    If the API answers with a non-zero ``stat.code`` the status is printed
    and nothing is stored. A failing insert is reported and rolled back
    instead of being raised, so one bad article does not stop the caller.

    Args:
        id: Identifier sent to the API as ``infoId``.

    Returns:
        None.
    """
    content_url = 'https://jkcdn.pajk.com.cn/'
    data_detail['infoId'] = id
    # Millisecond timestamp, the same value JavaScript's Number(new Date)
    # yields, obtained without starting a JavaScript runtime.
    data_detail['_st'] = int(time.time() * 1000)
    generateSig(data_detail)

    r = requests.post(url, headers=headers, data=data_detail)
    r_json = r.json()
    if r_json['stat']['code'] != 0:
        print('fail:')
        print(r_json['stat'])
        return

    detail = r_json['content'][0]
    content = requests.get(content_url + detail['contentTfs'])

    # Match each <img> tag on its own. A greedy `img.*src=` would skip to the
    # last src on a line and lose every earlier image on that line.
    pattern = re.compile(r'<img[^>]*?src="(.*?)"')
    imgs_url = pattern.findall(content.text)[:3]
    imgs_json = json.dumps(imgs_url)

    title = detail['headlineInfo']['title']
    nick = detail['accountInfo']['nick']
    # 0 for zero or one image, otherwise image count minus one.
    article_type = max(len(imgs_url) - 1, 0)

    # Placeholders are filled in by the driver, so quotes inside the scraped
    # HTML can neither break the statement nor inject SQL.
    sql = '''INSERT INTO hh_article(a_title,a_from,a_content,a_type,a_click,a_images) VALUES(%s,%s,%s,%s,0,%s)'''
    params = (title, nick, content.text, article_type, imgs_json)

    # Keyword arguments: PyMySQL 1.0+ no longer accepts positional ones.
    db = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        database='health_helper',
    )
    try:
        with db.cursor() as cursor:
            cursor.execute(sql, params)
        db.commit()
    except Exception as exc:
        # Best-effort insert: report the cause and carry on with the crawl.
        print('insert data error')
        print(exc)
        db.rollback()
    finally:
        db.close()
def main():
    """Crawl the search results page by page and store every text article.

    Starting at page 0 with a page size of 10, posts a signed list request
    for each page. Entries whose ``headLineType`` is ``'10000'`` are
    reported as video headlines and skipped; every other entry is passed to
    ``openArticle``. The loop pauses five seconds after each stored article
    and after each page.

    Crawling stops when the API returns a non-zero ``stat.code`` or when a
    page contains no entries.

    Returns:
        None.
    """
    pageNo = 0
    pageSize = 10
    data_list['pageSize'] = str(pageSize)

    while True:
        data_list['pageNo'] = str(pageNo)
        generateSig(data_list)
        r = requests.post(url, headers=headers, data=data_list)
        r_json = r.json()

        if r_json['stat']['code'] != 0:
            print('fail')
            # Show the reported status, as openArticle does on failure.
            print(r_json['stat'])
            return

        item_arr = r_json['content'][0].get('doc')
        if not item_arr:
            # An empty page means there is nothing further to crawl; without
            # this check the loop would keep requesting pages indefinitely.
            print('no more results')
            return

        for item in item_arr:
            if item['headLineType'] == '10000':
                print('视频类型头条:' + item['title'] + '--pass')
                continue
            print('正在爬取:' + item['title'])
            openArticle(item['headlineId'])
            time.sleep(5)

        pageNo = pageNo + 1
        time.sleep(5)


if __name__ == '__main__':
    main()
|