Files
clients/WebBasedCrawler/twitter/twdbhelper.py
mjjo 3d5e2d0c98 - 트위터 크롤러 수정
- 중복 제거 후 insert
  - proxy.txt가 모두 만료되면 db 사용
  - proxy db에서 중복 제거해서 가져오기
  - 프록시 문제로 페이지 요청 시 0.1초 딜레이
  - 크롤러 stop 동작하도록
  - realtime 적용
2017-07-28 14:29:05 +09:00

83 lines
2.5 KiB
Python

from twitter.tweet import Tweet
import multiprocessing as mp
class TwitterDBHelper:
    """Persists crawled tweets into the crawler's MySQL database."""

    # __import__('pymysql.cursors') returns the top-level `pymysql` package
    # with the `cursors` submodule already loaded, so both
    # self.pymysql.connect and self.pymysql.cursors.DictCursor resolve.
    # Kept as a class attribute so the module can be parsed without pymysql
    # installed at top level.
    pymysql = __import__('pymysql.cursors')

    def __init__(self):
        # Buffers kept for the (currently disabled) batched-insert path in
        # insert_tweet; the lock would guard them across worker processes.
        self.tweets = []
        self.buffer = []
        self.lock = mp.Lock()

    def __del__(self):
        # No per-instance resources to release: connections are opened and
        # closed per call.
        pass
def get_param(self, keyword_id):
query = "select * from keyword where id = " + str(keyword_id)
params = []
try:
conn = self.pymysql.connect(host='bigbird.iptime.org',
user='admin', passwd='admin123',
db='concepters', charset='utf8',
cursorclass=self.pymysql.cursors.DictCursor)
with conn.cursor() as cursor:
cursor.execute(query)
params = cursor.fetchone()
except Exception as e:
print(e)
exit(1)
else:
conn.close()
return params
def insert_tweet(self, tweet: Tweet = None, db_num: int = -1, flush=False):
# self.lock.acquire()
# if tweet is not None:
# self.buffer.append((tweet, db_num, ))
#
# local_buffer = None
# if len(self.buffer) >= 100 or flush:
# local_buffer = copy.deepcopy(self.buffer)
# self.buffer.clear()
# self.lock.release()
local_buffer = [(tweet, db_num, )]
if local_buffer:
while True:
try:
conn = self.pymysql.connect(host='bigbird.iptime.org',
user='admin', passwd='admin123',
db='concepters', charset='utf8',
cursorclass=self.pymysql.cursors.DictCursor,
connect_timeout=5)
except Exception as e:
print(e)
continue
else:
break
try:
with conn.cursor() as cursor:
for tweet, _db_num in local_buffer:
if not tweet.is_reply:
query = tweet.get_delete_query(_db_num)
cursor.execute(query)
query = tweet.get_insert_query(conn, _db_num)
cursor.execute(query)
conn.commit()
except Exception as e:
print(e)
finally:
conn.close()