Be tolerant of others, and go easy on yourself.

Scraping Swift course videos from Jikexueyuan (极客学院)

Posted by anchoriteFili
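Below is a small Python 2 crawler. It fetches page 5 of the jikexueyuan.com course search results for "swift", follows every course link it finds, then follows every lesson link on each course page, and prints the lesson title together with the <video> source URL embedded in the lesson page. A session cookie for a logged-in account is sent with every request; requests and lxml are the only third-party dependencies.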

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Python 2 script: crawl jikexueyuan.com Swift courses and print each lesson's
# video source URL.

import sys
import requests
from lxml import etree

# Python 2 only: make UTF-8 the default encoding so Chinese course titles can
# be printed and written to a file without UnicodeEncodeError.
reload(sys)
sys.setdefaultencoding('utf8')

def erJiWithEach(string):
    # Second-level crawl: "string" is a course URL taken from the search
    # results, e.g. http://www.jikexueyuan.com/course/151_1.html?ss=1
    html = requests.get(string, cookies=cook)

    # Every lesson of the course is linked from <div class="text-box"><h2><a>;
    # hand each link's href and title text down to the third-level crawl.
    pythonEtree2 = etree.HTML(html.text)
    pythonLink2 = pythonEtree2.xpath('//div[@class="text-box"]/h2/a')
    for each in pythonLink2:
        sanJiWithEach(each.xpath('@href')[0], each.xpath('text()')[0])

def sanJiWithEach(string, string1):
    # Third-level crawl: "string" is the lesson page that embeds the video
    # player, e.g. http://www.jikexueyuan.com/course/995_3.html?ss=1, and
    # "string1" is the lesson title taken from the course page.
    html = requests.get(string, cookies=cook)

    print string1

    # The player is a video.js element; each <source src="..."> under it is a
    # direct video URL.
    pythonEtree2 = etree.HTML(html.text)
    pythonLink2 = pythonEtree2.xpath('//video[@class="video-js vjs-default-skin"]/source/@src')
    for each in pythonLink2:
        print each
        # savefile() below can be used to record the title/URL pair as well;
        # see the sketch after the script.

def savefile(string):
    # Append one record to a hard-coded output file ("电影名单.txt" means
    # "movie list"; the path looks reused from an earlier script).
    f = open("/Users/macbook/python/代码/电影名单.txt", "a")
    f.writelines(string)
    f.close()




cook = {"Cookie": "Hm_lvt_f3c68d41bda15331608595c98e9c3915=1463967026,1464589515,1465369456,1465375154; _ga=GA1.2.27552659.1463361608; looyu_id=e6c8e122f0166411f95d0a25c6a2cc0564_20001269%3A15; stat_uuid=1463361609992293413184; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22154b72626b33fb-0062cf8f24bb7-48566c-fa000-154b72626b4198%22%7D; undefined=; stat_ssid=1465661524138; level_id=2; is_expire=0; domain=2090592740; code=USMC8J; Hm_lpvt_f3c68d41bda15331608595c98e9c3915=1465375759; stat_fromWebUrl=; stat_isNew=0; looyu_20001269=v%3A1e5b2c7f12dc58754a81b314f4a2f7403b%2Cref%3A%2Cr%3A%2Cmon%3Ahttp%3A//m9106.talk99.cn/monitor%2Cp0%3Ahttp%253A//www.jikexueyuan.com/course/995.html; uname=jike_1261656; uid=3654191; authcode=466bRLDaSw31dZyE395p09eKZZfg%2Bl4ZVwE6%2BeemiPh0azXwtzQMwGMMDyyFSFUu14TPg8BE7Gtr0uqru5dkI9OSnQIKQyNPVR93m61XWCJXi9XzyAd2vmOv1oLVfdIn; _99_mon=%5B0%2C0%2C0%5D"}
# cook (just above) is the session cookie of a logged-in account; it is shared
# by every requests.get() in this script, including the ones inside the two
# crawl functions. The driver below fetches page 5 of the course search
# results for "swift" and hands every course link to erJiWithEach().
url = 'http://search.jikexueyuan.com/course/?q=swift&page=5'
html = requests.get(url, cookies=cook)

pythonEtree2 = etree.HTML(html.text)
pythonLink2 = pythonEtree2.xpath('//div[@class="course-content"]/div[@class="info"]/a/@href')
for each in pythonLink2:
    erJiWithEach(each)
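
savefile() is defined above but never called. A minimal sketch of a sanJiWithEach variant that records each lesson through it, assuming the same setup as the script above; the record format (title, a space, the video URL, and a trailing newline for writelines) is a choice made for this sketch:

def sanJiWithEach(string, string1):
    html = requests.get(string, cookies=cook)
    print string1
    pythonEtree2 = etree.HTML(html.text)
    pythonLink2 = pythonEtree2.xpath('//video[@class="video-js vjs-default-skin"]/source/@src')
    for each in pythonLink2:
        print each
        # One record per video: "<lesson title> <video url>\n", appended to the
        # hard-coded file inside savefile().
        savefile('%s %s\n' % (string1, each))

Similarly, the driver only requests page=5 of the search results. A sketch that walks pages 1 through 5 instead (the upper bound is an assumption; adjust it to the real number of result pages):

for page in range(1, 6):
    url = 'http://search.jikexueyuan.com/course/?q=swift&page=%d' % page
    html = requests.get(url, cookies=cook)
    pythonEtree2 = etree.HTML(html.text)
    for each in pythonEtree2.xpath('//div[@class="course-content"]/div[@class="info"]/a/@href'):
        erJiWithEach(each)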