车型数据库
用最简单的爬虫获取汽车之家上各个型号汽车的配置：
#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import httplib
import random
import threading
import time
import sqlite3
# Filesystem path handed to sqlite3.connect() by the database writer.
dbpath = '/Users/baidu/Desktop/cardata'
# Shared list of pending INSERT jobs. Crawler threads append
# {'sql': ..., 'data': ...} dicts here and the writer loop executes them.
params = list()
def str2unicode(mustr):
    """Decode a UTF-8 byte string into a text (unicode) string.

    Values that are already text are returned unchanged, so the helper is
    safe to call twice on the same value (the previous implementation raised
    ``TypeError: decoding Unicode is not supported`` in that case).

    :param mustr: UTF-8 encoded byte string, or an already-decoded text string.
    :returns: the decoded text string.
    :raises UnicodeDecodeError: if the bytes are not valid UTF-8.
    """
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
    if isinstance(mustr, type(u'')):
        return mustr
    return mustr.decode('utf-8')
def unicode2str(mstr):
    """Encode a text (unicode) string as UTF-8 bytes.

    Byte strings are returned unchanged. On Python 2, calling ``.encode`` on
    a byte string first decodes it as ASCII, which raises
    ``UnicodeDecodeError`` for any non-ASCII content.

    :param mstr: text string, or an already-encoded byte string.
    :returns: the UTF-8 encoded byte string.
    """
    if isinstance(mstr, bytes):
        return mstr
    return mstr.encode('utf-8')
def runRequest(host, path, method, headers, body, ishttps=True, port=80):
    """Send one HTTP(S) request and return the JSON-decoded response body.

    :param host: server host name.
    :param path: request path, including any query string.
    :param method: HTTP method name, e.g. ``'GET'``.
    :param headers: dict of request headers.
    :param body: request body (pass ``''`` for none).
    :param ishttps: use ``HTTPSConnection`` when true, ``HTTPConnection``
        otherwise.
    :param port: accepted for backward compatibility but NOT used; the
        connection always uses the scheme's default port. Honouring the
        default of 80 would break existing HTTPS callers.
    :returns: the object produced by ``json.loads`` on the response body.
    :raises ValueError: if the response body is not valid JSON.
    """
    if ishttps:
        connection_cls = httplib.HTTPSConnection
    else:
        connection_cls = httplib.HTTPConnection
    conn = connection_cls(host)
    try:
        conn.request(method, path, headers=headers, body=body)
        return json.loads(conn.getresponse().read())
    finally:
        # Always release the socket, including when the request or the JSON
        # decoding fails; the crawler issues thousands of requests.
        conn.close()
def _collect_values(paramitems):
    """Flatten the response's parameter groups into a {name: value} dict."""
    cardict = {}
    for group in paramitems:
        for entry in group['items']:
            try:
                key = entry['name']
                excess = entry['modelexcessids']
                # NOTE(review): the original indexes 'modelexcessids' directly
                # with 'value'. The API may instead return a list with one
                # entry per requested spec -- both shapes are accepted here;
                # confirm against a live response.
                if isinstance(excess, list):
                    excess = excess[0]
                val = excess['value']
            except (KeyError, IndexError, TypeError):
                # Best effort: skip entries that lack the expected fields.
                continue
            # The service reports a missing value as '-'; store it as empty.
            cardict[key] = u'' if val == u'-' else val
    return cardict


def _build_insert(cardict):
    """Build the parameterized INSERT for one car; None if there are no columns."""
    if not cardict:
        return None
    names = list(cardict)
    # Column names come from the remote response and cannot be bound as SQL
    # parameters, so embedded double quotes are doubled to keep each name a
    # single quoted identifier.
    columns = ','.join('"%s"' % name.replace('"', '""') for name in names)
    marks = ','.join('?' for _ in names)
    sql = 'insert into cardata(%s) values(%s)' % (columns, marks)
    return sql, tuple(cardict[name] for name in names)


def doPost(id):
    """Fetch the configuration of one car spec and queue an INSERT for it.

    Requests the spec-compare endpoint for ``id``, maps every parameter name
    to its value, and appends ``{'sql': ..., 'data': ...}`` to the
    module-level ``params`` list for the database writer. Nothing is queued
    when the response yields no usable parameters.

    All exceptions are caught and printed so that one failing spec does not
    kill its worker thread.

    :param id: integer spec id, substituted into the query string with ``%d``.
    """
    try:
        response = runRequest(
            'cars.app.autohome.com.cn',
            '/cfg_v8.2.0/cars/speccompare.ashx?specids=%d' % id,
            'GET',
            {},
            '',
            ishttps=True)  # original note: id = 18~ (presumably the first valid id -- confirm)
        cardict = _collect_values(response['result']['paramitems'])
        for value in cardict.values():
            # Debug output kept from the original: echo plain integer values.
            if type(value) is int:
                print(value)
        statement = _build_insert(cardict)
        if statement is not None:
            sql, data = statement
            params.append({'sql': sql, 'data': data})
        print('%d handled' % id)
    except Exception as e:
        print(e)
def updateDb():
    """Write queued INSERT jobs from ``params`` to the SQLite database, forever.

    Every 10 seconds the jobs currently in the module-level ``params`` list
    are executed against the database at ``dbpath`` and committed, then
    removed from the list. Removing them is the fix for the earlier
    behaviour, where every cycle re-inserted all previously written rows.

    A row whose statement fails is printed and dropped, so a single bad row
    cannot block the rows behind it. If the database cannot be opened or the
    commit fails, the batch stays queued and is retried on the next cycle.

    This function never returns.
    """
    while True:
        # Snapshot the queue. Workers only append at the tail, so the first
        # len(batch) entries of ``params`` stay exactly this snapshot.
        # Relies on CPython list operations being atomic under the GIL.
        batch = list(params)
        if batch:
            try:
                dbconn = sqlite3.connect(dbpath)
                try:
                    cursor = dbconn.cursor()
                    for param in batch:
                        try:
                            cursor.execute(param['sql'], param['data'])
                        except sqlite3.Error as e:
                            print(e)
                    dbconn.commit()
                    cursor.close()
                finally:
                    # Close the connection even when a statement or the
                    # commit raised.
                    dbconn.close()
                # Only forget the batch once it has been committed.
                del params[:len(batch)]
            except Exception as e:
                print(e)
        time.sleep(10)
class threadPool(object):
    """Run threads with at most ``workers`` of them alive at the same time.

    ``jobs`` is a fixed-size list of slots; each slot holds ``None`` or the
    thread most recently started in that slot.
    """

    # Maximum number of concurrently running threads (number of slots).
    workers = 10

    def __init__(self):
        # Per-instance slot list. A class-level list would be shared by every
        # pool and would grow each time a pool is constructed.
        self.jobs = [None] * self.workers

    def add(self, job):
        """Start ``job`` in the first free slot, blocking until one is free.

        :param job: an unstarted ``threading.Thread``.
        """
        while True:
            for index in range(self.workers):
                current = self.jobs[index]
                if current is None or not current.is_alive():
                    self.jobs[index] = job
                    job.start()
                    return
            # Every slot is busy: poll again shortly.
            time.sleep(0.5)

    def end(self):
        """Block until every thread still held in a slot has finished."""
        for job in self.jobs:
            if job is not None:
                job.join()
if __name__ == '__main__':
    # Script entry point: crawl spec ids 18..9999 through the bounded pool
    # while a background thread writes the queued rows to the database.
    lock = threading.Lock()
    pool = threadPool()
    # NOTE: updateDb loops forever in a non-daemon thread, so the process
    # keeps running after the crawl finishes and must be stopped manually.
    writer = threading.Thread(target=updateDb, args=())
    writer.start()
    # 18
    for spec_id in range(18, 10000):
        worker = threading.Thread(target=doPost, args=(spec_id,))
        pool.add(worker)
    # Wait for the last crawler threads before the main thread finishes.
    pool.end()
    time.sleep(10)
附件为 SQLite 数据库，下面来进行简单分析：
这!是!python!
从前面几句import就让人感受到它的库的丰富
页:
[1]