Files
clients/WebBasedCrawler/base/dbdata.py
mjjo 3d5e2d0c98 - 트위터 크롤러 수정
- 중복 제거 후 insert
  - proxy.txt가 모두 만료되면 db 사용
  - proxy db에서 중복 제거해서 가져오기
  - 프록시 문제로 페이지 요청 시 0.1초 딜레이
  - 크롤러 stop 동작하도록
  - realtime 적용
2017-07-28 14:29:05 +09:00

84 lines
2.4 KiB
Python

from pymysql.connections import Connection
import datetime
from numbers import Number
class DataDBRow:
    """A single row destined for a ``data_<db_num>`` table.

    The instance attributes assigned in ``__init__`` double as the column
    names used by :meth:`get_keys`, :meth:`get_values` and
    :meth:`get_insert_query`; their assignment order is the column order
    (instance ``__dict__`` preserves insertion order on Python 3.7+).
    """

    def __init__(self):
        # Every attribute below becomes one column of the generated INSERT
        # statement, so adding, removing or reordering a line here changes
        # the SQL that is produced.
        self.platform_name = None
        self.platform_form = None
        self.platform_title = None
        self.article_form = None
        self.article_parent = None
        self.article_id = None
        self.article_nickname = None
        self.article_title = None
        self.article_data = None
        self.article_url = None
        self.article_hit = 0
        self.article_date = None
        self.article_order = 0
        self.article_profile = None
        self.article_profileurl = None
        self.platform_id = None
        self.keyword_id = -1
        self.reply_url = None
        self.etc = None

    @staticmethod
    def _column_names():
        """Return the column names declared by ``__init__`` as a tuple.

        A pristine ``DataDBRow`` is inspected instead of the calling
        instance, so attributes attached to an individual row afterwards
        never leak into the generated SQL.
        """
        defaults = vars(DataDBRow())
        return tuple(
            name
            for name, default in defaults.items()
            if not (name.startswith('__') or callable(default))
        )

    @staticmethod
    def _sql_literal(conn, value):
        """Render one Python value as the text of an SQL literal.

        Args:
            conn: Object exposing ``escape(value)`` that returns a quoted,
                escaped SQL literal as ``str``.
            value: The attribute value to render.

        Returns:
            ``str(value)`` for numbers, otherwise ``conn.escape(...)``.

        Raises:
            UnicodeEncodeError: If a ``str`` value cannot be encoded as
                UTF-8 (for example, it contains a lone surrogate).
        """
        if isinstance(value, Number):
            return str(value)
        if isinstance(value, str):
            # The round-trip is a no-op for well-formed text; it is kept
            # because it rejects strings that cannot be encoded as UTF-8
            # before they are embedded in a query.
            return conn.escape(value.encode('utf8').decode('utf8'))
        return conn.escape(value)

    def get_keys(self):
        """Return the column names as a tuple, in declaration order."""
        return self._column_names()

    def get_values(self, conn, db_num):
        """Return this row's values as SQL literal strings.

        Args:
            conn: Object exposing ``escape(value)``.
            db_num: Unused here; kept so the signature matches
                :meth:`get_insert_query`.

        Returns:
            A tuple of strings aligned index-for-index with
            :meth:`get_keys`.

        Raises:
            KeyError: If one of the declared attributes has been deleted
                from this instance.
        """
        own = vars(self)
        return tuple(
            self._sql_literal(conn, own[name])
            for name in self._column_names()
        )

    def get_delete_query(self, db_num, conn=None):
        """Build a DELETE statement matching this row's ``article_url``.

        Args:
            db_num: Suffix of the target table (``data_<db_num>``). It is
                interpolated as-is, so callers must pass a trusted value.
            conn: Optional object exposing ``escape(value)``. When given,
                the URL is quoted and escaped by it.

        Returns:
            The DELETE statement as a string.
        """
        url = str(self.article_url)
        if conn is not None:
            quoted_url = conn.escape(url)
        else:
            # Without a connection, escape the two characters that can
            # terminate or alter a single-quoted literal. Doubling the
            # backslash assumes the server's default sql_mode (backslash
            # escapes enabled); pass ``conn`` for server-aware escaping.
            safe = url.replace('\\', '\\\\').replace("'", "''")
            quoted_url = "'{}'".format(safe)
        return """delete from data_{} where article_url={}""".format(
            db_num, quoted_url)

    def get_insert_query(self, conn, db_num):
        """Build an INSERT statement covering every declared column.

        Args:
            conn: Object exposing ``escape(value)``.
            db_num: Suffix of the target table (``data_<db_num>``). It is
                interpolated as-is, so callers must pass a trusted value.

        Returns:
            The INSERT statement as a string.
        """
        columns = ', '.join(self.get_keys())
        literals = ', '.join(self.get_values(conn, db_num))
        return 'insert into data_{} ({}) values ({})'.format(
            db_num, columns, literals)