#!/usr/bin/env python
# coding: utf-8
"""Crawl one-way flight listings from the Ctrip flights API into a DataFrame.

The script downloads the city list, then for each of the next seven days and
every ordered pair of distinct cities it requests the product list, flattens
each ``Flight`` route into one DataFrame row and writes a CSV per flight date.
"""

import datetime
import json
import random
import time

import pandas as pd

try:
    import requests
except ImportError:  # pragma: no cover - only hit when requests is absent
    # Keep the pure helpers (date generation, payload parsing, flattening)
    # importable without the HTTP client; the network functions raise a
    # clear ImportError when they are actually called.
    requests = None

# Endpoints used by the crawler.
CITY_URL = 'https://flights.ctrip.com/itinerary/api/poi/get'
PRODUCTS_URL = "https://flights.ctrip.com/itinerary/api/12808/products"

# Seconds to wait for an HTTP response before giving up on the request.
REQUEST_TIMEOUT = 30

# Token sent with every product search request.
SEARCH_TOKEN = "3481e1f047cee3eb638f2435b7c1b860"

# Pool of User-Agent strings; one is picked at random for each search request.
userAgent = [
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
    # The comma after the next entry was missing, which silently glued two
    # user agents into a single bogus string.
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
]

# (DataFrame column, key in the ``flight`` dict) pairs, in output column order.
_FLIGHT_FIELDS = (
    ('airlineCode', 'airlineCode'),
    ('AirlineName', 'airlineName'),
    ('durationDays', 'durationDays'),
    ('flightNumber', 'flightNumber'),
    ('mealFlag', 'mealFlag'),
    ('mealType', 'mealType'),
    ('comfort', 'comfort'),
    ('craftKind', 'craftKind'),
    ('craftTypeCode', 'craftTypeCode'),
    ('craftTypeKindDisplayName', 'craftTypeKindDisplayName'),
    ('craftTypeName', 'craftTypeName'),
    ('delayedTime', 'delayedTime'),
    ('oilFee', 'oilFee'),
    ('punctualityRate', 'punctualityRate'),
    ('sharedFlightName', 'sharedFlightName'),
    ('sharedFlightNumber', 'sharedFlightNumber'),
    ('specialCraft', 'specialCraft'),
    ('stopInfo', 'stopInfo'),
    ('stopTimes', 'stopTimes'),
    ('tax', 'tax'),
)

# (DataFrame column, key in ``arrivalAirportInfo``) pairs.
_ARRIVAL_FIELDS = (
    ('arrivalairportName', 'airportName'),
    ('arrivalairportTlc', 'airportTlc'),
    ('arrivalcityName', 'cityName'),
    ('arrivalcityTlc', 'cityTlc'),
)

# (DataFrame column, key in ``departureAirportInfo``) pairs.
_DEPARTURE_FIELDS = (
    ('departureairportName', 'airportName'),
    ('departureairportTlc', 'airportTlc'),
    ('departureCityName', 'cityName'),
    ('departureCityTlc', 'cityTlc'),
)

# Keys of the ``characteristic`` dict copied to same-named columns, in order.
_CHARACTERISTIC_KEYS = (
    'businessAircraft',
    'discountAmount',
    'discountShowType',
    'flyMan',
    'groupTicketPrice',
    'hotFlight',
    'hx',
    'infantSoldOut',
    'lowPriceDiscount',
    'lowestBabyCfPrice',
    'lowestBabyPrice',
    'lowestCfPrice',
    'lowestChildAdultCfPrice',
    'lowestChildAdultPrice',
    'lowestChildCfPrice',
    'lowestChildPrice',
    'lowestPrice',
    'promotion',
    'providerHx',
    'roundTripDiscounts',
)


def _http():
    """Return the ``requests`` module, raising ImportError if it is missing."""
    if requests is None:
        raise ImportError("The 'requests' package is required for network access")
    return requests


def parse_city_data(city_load):
    """Convert the ``data`` member of the city API response into a lookup dict.

    ``city_load`` maps a group name to a dict whose values are lists of
    entries; each entry carries a pipe-separated ``data`` string. The group
    named ``'热门'`` is skipped. From each string, field 1 (with anything
    from the first ``'('`` onwards removed) becomes the city name, field 3 is
    stored as the city id and field 2 as the numeric city id.

    Args:
        city_load: the decoded ``data`` dict, or ``None``.

    Returns:
        dict mapping city name -> ``[cityId, cityNumId]``. Empty when
        ``city_load`` is empty or ``None``. Entries with fewer than four
        pipe-separated fields are ignored.
    """
    city = {}
    for group_name, group in (city_load or {}).items():
        if group_name == '热门':
            continue
        for entries in (group or {}).values():
            for entry in entries or []:
                fields = (entry.get('data') or '').split('|')
                if len(fields) < 4:
                    continue  # malformed entry: not enough fields to index
                cityName = fields[1].split('(')[0]
                city[cityName] = [fields[3], fields[2]]
    return city


# get city
def getCityMsg():
    """Download the city list and return ``{cityName: [cityId, cityNumId]}``.

    Raises:
        ImportError: if ``requests`` is not installed.
        ValueError: if the response body is not valid JSON.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        "Referer": "https://flights.ctrip.com/itinerary",
        "Content-Type": "application/json",
    }
    body = _http().get(url=CITY_URL, headers=headers, timeout=REQUEST_TIMEOUT).text
    return parse_city_data(json.loads(body).get('data'))


def get_date(days=7):
    """Return the next ``days`` calendar dates as ``'YYYY-MM-DD'`` strings.

    The list starts with tomorrow (today itself is not included) and is in
    ascending order.
    """
    today = datetime.date.today()
    return [
        (today + datetime.timedelta(days=offset)).strftime('%Y-%m-%d')
        for offset in range(1, days + 1)
    ]


# get page text: routeList
def get_routeList(headers, load_json, max_retries=5, retry_delay=30):
    """POST one search request and return its ``routeList``.

    When the response carries no ``routeList`` the failure is printed and the
    request is retried after ``retry_delay`` seconds, at most ``max_retries``
    times.

    Args:
        headers: HTTP headers for the request.
        load_json: search payload; it is JSON-encoded into the request body.
        max_retries: number of additional attempts after the first failure.
        retry_delay: seconds to sleep between attempts.

    Returns:
        The ``routeList`` value from the response, or an empty list when every
        attempt failed, so callers can always iterate the result.

    Raises:
        ImportError: if ``requests`` is not installed.
        ValueError: if a response body is not valid JSON.
    """
    http = _http()
    params = (load_json.get('airportParams') or [{}])[0]
    for attempt in range(max_retries + 1):
        response = http.post(
            url=PRODUCTS_URL,
            data=json.dumps(load_json),
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        ).text
        data = json.loads(response).get("data") or {}
        result = data.get('routeList')
        if result is not None:
            return result
        print('Get 【{} --> {}】 Page is failed !'.format(params.get('dcityname'), params.get('acityname')))
        if attempt < max_retries:
            print('休息{}s后再来……'.format(retry_delay))
            time.sleep(retry_delay)
    return []


def _set_fields(df, index, source, fields):
    """Copy ``source[key]`` into ``df.loc[index, column]`` for each pair."""
    for column, key in fields:
        df.loc[index, column] = source.get(key)


# get Data
def get_data(index, df, routeList):
    """Flatten every ``Flight`` route of ``routeList`` into rows of ``df``.

    ``index`` is incremented once per ``Flight`` route and used as the row
    label. Only the first leg of a route is read. Missing nested dicts
    (airport info, terminal, standard prices) leave the corresponding cells
    empty instead of raising.

    Args:
        index: label of the last row written so far.
        df: DataFrame to extend; it is modified in place.
        routeList: list of route dicts, or ``None``/empty for nothing to do.

    Returns:
        ``(index, df)`` with the updated row counter and the same DataFrame.
    """
    for route in routeList or []:
        if route.get('routeType') != 'Flight':  # keep flights only
            continue
        legs = route.get('legs') or []
        if not legs:
            continue
        index += 1
        leg = legs[0]

        # ---- flight
        flight = leg.get('flight')
        if flight is not None:
            _set_fields(df, index, flight, _FLIGHT_FIELDS)

            arrival = flight.get('arrivalAirportInfo') or {}
            _set_fields(df, index, arrival, _ARRIVAL_FIELDS)
            df.loc[index, 'arrivalTerminalName'] = (arrival.get('terminal') or {}).get('name')
            df.loc[index, 'arrivalDate'] = flight.get('arrivalDate')

            departure = flight.get('departureAirportInfo') or {}
            _set_fields(df, index, departure, _DEPARTURE_FIELDS)
            df.loc[index, 'departureTerminalName'] = (departure.get('terminal') or {}).get('name')
            df.loc[index, 'departureDate'] = flight.get('departureDate')

        # ---- characteristic
        charactor = leg.get('characteristic')
        if charactor is not None:
            _set_fields(df, index, charactor, [(key, key) for key in _CHARACTERISTIC_KEYS])
            # One column per cabin class, named 'price<cabinClass>'.
            for stdPrice in charactor.get('standardPrices') or []:
                cabinClass = stdPrice.get('cabinClass')
                if cabinClass is None:
                    continue
                df.loc[index, 'price' + cabinClass] = stdPrice.get('price')
            df.loc[index, 'superFlyMan'] = charactor.get('superFlyMan')
            df.loc[index, 'weight'] = charactor.get('weight')

        # ---- cabins
        # The leg's 'cabins' list is intentionally not flattened here: its
        # length varies per flight, which would create many mostly-empty
        # columns. It is better stored as a separate table.
    return (index, df)


# main function
def main(index, df, city, token=SEARCH_TOKEN):
    """Crawl all city pairs for the next seven days and write one CSV per date.

    Args:
        index: starting row counter passed through to ``get_data``.
        df: DataFrame that accumulates every crawled row.
        city: mapping of city name -> ``[cityId, cityNumId]`` as returned by
            ``getCityMsg``.
        token: value sent as ``token`` in each search payload.

    Returns:
        ``(index, df)`` after the last request.
    """
    flightDates = get_date()
    for flightDate in flightDates:  # departure date
        print(flightDate, end='\t')
        for fromCityName, fromCityId in city.items():  # departure city
            for toCityName, toCityId in city.items():  # arrival city
                if fromCityName == toCityName:
                    continue
                headers = {
                    "User-Agent": random.choice(userAgent),
                    "origin": "https://flights.ctrip.com",
                    "content-type": "application/json",
                }
                load_json = {
                    "airportParams": [
                        {
                            "dcity": fromCityId[0],
                            "dcityname": fromCityName,
                            "acity": toCityId[0],
                            "acityname": toCityName,
                            "date": flightDate,
                            "dcityid": fromCityId[1],
                            "acityid": toCityId[1],
                        }
                    ],
                    "classType": "ALL",
                    "date": flightDate,
                    "flightWay": "Oneway",
                    "hasBaby": False,
                    "hasChild": False,
                    "searchIndex": 1,
                    "token": token,
                }
                routeList = get_routeList(headers, load_json)
                (index, df) = get_data(index, df, routeList)
                print(index, df.shape, end='\t')
                # Throttle: pause 5-9 s between consecutive requests.
                time.sleep(random.choice(range(5, 10)))
            time.sleep(random.choice(range(5, 10)))
            print('【{}】起飞,抓完!'.format(fromCityName))
            # Longer 60-89 s pause after finishing a departure city.
            time.sleep(random.choice(range(60, 90)))
        print('起飞日期:{},抓完,写入文件!'.format(flightDate))
        # df is never reset, so each file holds every row crawled so far.
        df.to_csv('【{}】爬取:【{}】起飞航班.csv'.format(flightDates[0], flightDate), index=False)
    return (index, df)


if __name__ == "__main__":
    df = pd.DataFrame()  # accumulates the crawled rows
    index = 0  # row counter
    city = getCityMsg()
    (index, df) = main(index, df, city)
# NOTE: mind the crawl speed (keep requests throttled) and replace the token in load_json with a current one before running.