"""Crawler for KakaoStory articles and their replies.

``BodyCrawler`` extracts the article itself, ``ReplyCrawler`` loads and
extracts all replies, and ``EffectKakaostory`` drives both and ships the
aggregated statistics to the result DB.
"""
import datetime
import json
import re

import effect.effecterror
from bs4 import BeautifulSoup

from base.baseclasses import wait
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import enter_element
from effect.resultsender import get_settings
from effect.resultsender import ResultSender
from kakao.kakaoexception import NotFoundElementError

# Prefer the faster lxml parser when it is installed.
try:
    import lxml  # noqa: F401
    parser_opt = 'lxml'
except ImportError:
    parser_opt = 'html.parser'

SEPERATOR = '!@#'  # NOTE(review): misspelling kept for backward compatibility
kakaostory_url = 'https://story.kakao.com/'
kakaostory_channel_url = 'https://story.kakao.com/ch/'
limit_reload = 5
num_of_retry = 3

# Matches KakaoStory tooltip dates, capturing:
#   (1) year (2) month (3) day (4) am/pm word (5) hour (6) minute
re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})"
                     "[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})")


def _match_to_date_str(m):
    """Convert a ``re_date`` match into a 'yyyy-MM-dd hh:mm:ss' string."""
    parsed = datetime.datetime(int(m.group(1)), int(m.group(2)),
                               int(m.group(3)),
                               int(m.group(5)), int(m.group(6)))
    # The site shows a 12-hour clock; add 12 hours for afternoon ("오후").
    # NOTE(review): "오전 12:xx" (midnight) is left as 12:xx, matching the
    # original behavior — confirm whether the site ever emits that form.
    if m.group(4) == "오후" and int(m.group(5)) < 12:
        parsed += datetime.timedelta(hours=12)
    return str(parsed)


def _parse_time_element(element):
    """Parse the tooltip date of *element*; return the date string or None.

    Before the mouse hovers over the tooltip the date string is in the
    ``title`` attribute; after a hover it moves to ``data-tooltip``.
    """
    m = re_date.search(element.attrs.get('title', '')) \
        or re_date.search(element.attrs.get('data-tooltip', ''))
    return _match_to_date_str(m) if m else None


def get_date(element):
    """Return 'yyyy-MM-dd hh:mm:ss' parsed from a span.time element.

    :param element: this may be a span.time element
    :return: the date string, or the invalid-date sentinel
        '0000-00-00 00:00:00' when no date can be parsed (instead of raising).
    """
    return _parse_time_element(element) or "0000-00-00 00:00:00"


class BodyCrawler(object):
    """Extract the article body fields from the current driver page."""

    def __init__(self, driver):
        self.driver = driver
        self.soup = None
        self.section_activity = None
        # calling point may differ
        self.set_soup_and_activity()

    def set_soup_and_activity(self):
        """(Re)parse the current page source and cache the article section."""
        self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
        # There are many 'div.section _activity' elements, but the one we
        # use is inside div.cover_wrapper.
        self.section_activity = self.soup.find('div', class_='section _activity')

    def find_article_id(self):
        """Return the author id taken from the profile link href."""
        a = self.section_activity.find('a', class_='pf_name')
        href = a.attrs['href'].replace('https://story.kakao.com/', '')
        return href[1:] if href.startswith('/') else href

    def find_article_nickname(self):
        """Return the author's display name."""
        a = self.section_activity.find('a', class_='pf_name')
        return a.text

    def find_article_url(self):
        # In Chrome, current_url equals the article URL.
        # TODO(review): verify for other browsers.
        return self.driver.current_url

    def find_article_modified_date(self):
        """Return the modified date string, or None when never modified.

        The written time is always present; a second span.time appears only
        when the article was edited, so fewer than two entries means the
        article was not modified.
        """
        times = None
        add_top = self.section_activity.find('div', class_='add_top')
        if add_top:
            times = add_top.find_all('span', class_='time')
        if not times or len(times) < 2:
            return None
        # times[0]: written time, times[1]: modified time (wraps a span)
        if times[1].span:
            # Return None instead of raising when the date is unparseable.
            return _parse_time_element(times[1].span)
        # Missing inner span: report "not modified" instead of raising.
        return None

    def find_article_date(self):
        """Return the written date string.

        :raises NotFoundElementError: when the date DOM is missing.
        :return: 'yyyy-MM-dd hh:mm:ss', or '0000-00-00 00:00:00' when the
            element exists but does not parse.
        """
        # NOTE: the modified date is deliberately not preferred here
        # (the original priority logic was commented out).
        add_top = self.section_activity.find('div', class_='add_top')
        if not add_top:
            raise NotFoundElementError("find_article_data DOM is missing : add_top")
        times = add_top.find_all('span', class_='time')
        if not times:
            raise NotFoundElementError("find_article_data DOM is missing : time")
        return get_date(times[0])

    def find_article_profileurl(self):
        """Return the profile image URL, or '' (not essential, no exception)."""
        profile_area = self.section_activity.find('div', class_='_profileArea pf')
        if profile_area.a and profile_area.a.img and profile_area.a.img.get('src'):
            return profile_area.a.img.get('src')
        return ''

    def find_article_data(self):
        """Return the trimmed article text, or '' when absent."""
        content = self.section_activity.find('div', class_='txt_wrap')
        if content and content.text:
            return content.text.strip().replace('\xa0', '\n')
        return ''

    def find_article_title(self):
        """Return the article title.

        strong.tit_channel is the channel title; when it does not exist the
        title is the first line of the article body, limited to 30 chars.
        """
        strong = self.section_activity.find('strong', class_='tit_channel')
        if strong and strong.text:
            return strong.text.replace('\xa0', '')
        article_data = self.find_article_data()
        if not article_data:
            return ''
        lines = article_data.splitlines()
        return lines[0][:30] if lines else ''

    def find_article_etc(self, class_name):
        """Return the count text (shares/replies/feelings) for *class_name*.

        :return: the number as a string with thousands-commas removed,
            or '0' when the element or its text is missing.
        """
        element = self.section_activity.find('strong', class_=class_name)
        if element and element.text:
            # The page formats counts with commas; strip them for int().
            return element.text.replace(',', '')
        return '0'

    def find_article_share(self):
        return self.find_article_etc('_storyShareCount')

    def find_article_feeling(self):
        return self.find_article_etc('_likeCount')

    def find_article_reply_num(self):
        return self.find_article_etc('_commentCount')

    def find_platform_form(self):
        """'channel' for channel articles ('ch/...'), otherwise 'story'."""
        article_id = self.find_article_id()
        return 'channel' if article_id.startswith('ch/') else 'story'

    def find_error(self):
        """True when the page shows the 'deleted/not found' error box."""
        return self.soup.find('div', class_='info_error') is not None

    def get(self):
        """Return a dict of crawled body fields.

        The caller still needs to put 'keyword_id'.
        NOTE(review): 'article_order' carries the reply count,
        'article_parent' the share count and 'reply_url' the feeling count —
        this field reuse is preserved from the original schema.

        :raises NotFoundElementError: when the article section is missing.
        """
        if not self.section_activity:
            raise NotFoundElementError("section _activity is not Found")
        content = dict()
        content['article_id'] = self.find_article_id()
        content['article_nickname'] = self.find_article_nickname()
        content['article_data'] = self.find_article_data()
        content['article_title'] = self.find_article_title()
        content['article_date'] = self.find_article_date()
        content['article_url'] = self.find_article_url()
        content['article_profileurl'] = self.find_article_profileurl()
        content['article_order'] = self.find_article_reply_num()
        content['article_parent'] = self.find_article_share()
        content['reply_url'] = self.find_article_feeling()
        content['platform_form'] = self.find_platform_form()
        content['article_form'] = 'body'
        content['platform_name'] = 'kakaostory'
        content['platform_id'] = content['article_id']
        content['platform_title'] = content['article_nickname']
        return content


class ReplyCrawler(object):
    """Load every reply via the driver, then extract them with BeautifulSoup."""

    def __init__(self, driver):
        self.driver = driver
        self.soup = None
        self.section_activity = None
        self.ul = None
        self.lis = None

    def set_soup_and_activity(self):
        """(Re)parse the page and cache the reply list container."""
        self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
        # There are many 'div.section _activity' elements, but the one we
        # use is inside div.cover_wrapper.
        self.section_activity = self.soup.find('div', class_='section _activity')
        self.ul = self.section_activity.find('ul', class_='list _listContainer')

    def load_all_reply(self):
        """Click 'show more' until the reply count stops growing."""
        previous_num_of_replies = 0
        while self.has_more():
            self.click_load_more_reply_btn()
            # Compare counts before and after the click: if they are equal
            # the link or ajax failed, so stop instead of looping forever.
            current_num_of_replies = self.get_num_of_replies()
            if previous_num_of_replies == current_num_of_replies:
                break
            previous_num_of_replies = current_num_of_replies

    def get_num_of_replies(self):
        """Return the number of currently loaded reply <li> elements.

        Returns 0 when the reply list is absent (an exception means there
        is no reply). For performance this could be reimplemented with bs4.
        """
        try:
            ul = find_element_by_css_selector(self.driver,
                                              "div[class='section _activity'] "
                                              "ul[class='list _listContainer']",
                                              5)
            return len(ul.find_elements_by_tag_name('li'))
        except Exception:
            return 0

    def click_load_more_reply_btn(self):
        """Find the 'load more replies' link and activate it, if present."""
        try:
            a = find_element_by_css_selector(self.driver,
                                             "div[class='section _activity'] "
                                             "a[class='_btnShowMoreComment']",
                                             5)
            enter_element(a)
        except Exception:
            # No link in the page: silently do nothing instead of raising.
            pass

    def has_more(self):
        """True while the 'show more comments' container is displayed.

        Returns False both when loading is finished and when the selector
        no longer matches — the two cases raise the same exception and
        cannot be distinguished here.
        """
        try:
            show_more = find_element_by_css_selector(
                self.driver,
                "div[class='section _activity'] "
                "p[class='more _showMoreCommentContainer']",
                5)
            # 'display:block;' shows the button, 'display:none;' hides it.
            return 'block' in show_more.get_attribute('style')
        except Exception:
            return False

    # find_xxxx functions

    def find_article_id(self):
        """Return the list of replier ids (hrefs with the site URL stripped)."""
        anchors = self.ul.find_all('a', class_='name _namePlaceholder')
        hrefs = [a.attrs.get('href', '').replace(kakaostory_url, '')
                 for a in anchors if a.attrs.get('href', '')]
        # Refine hrefs: they may still start with '/'.
        return [h[1:] if h.startswith('/') else h for h in hrefs]

    def find_article_nickname(self):
        """Return the list of replier nicknames ('' when absent)."""
        anchors = self.ul.find_all('a', class_='name _namePlaceholder')
        return [a.text if a.text else '' for a in anchors]

    def find_article_data(self):
        """Return the list of reply texts.

        div.text starts with meta-data held in div.p.text; when that inner
        <p> exists its text is sliced off the front. '' when text is absent.
        """
        divs = self.ul.find_all('div', class_='txt')
        return [div.text[len(div.p.text):].replace('\xa0', '\n') if div.p
                else div.text if div.text else ''
                for div in divs]

    def find_article_date(self):
        """Return the list of reply date strings (see ``get_date``)."""
        spans = self.ul.find_all('span', class_='time')
        return list(map(get_date, spans))

    def find_article_like(self):
        """Return the list of per-reply like counts ('' when absent)."""
        spans = self.ul.find_all('span', class_='like_num _likeCommentCount')
        return [span.text if span.text else '' for span in spans]

    def find_article_profileurl(self):
        """Return the list of replier profile image URLs ('' when absent)."""
        divs = self.ul.find_all('div', class_='pf')
        return [div.a.img.attrs.get('src', '') if div.a and div.a.img else ''
                for div in divs]

    def get(self):
        """Return the list of reply dicts.

        The caller needs to add platform_title, platform_id and
        platform_form from the body.
        An IndexError here means the per-field lists got out of sync,
        which is intentionally left unguarded.
        """
        # Load every reply first, then crawl them with BeautifulSoup.
        self.load_all_reply()
        self.set_soup_and_activity()
        article_ids = self.find_article_id()
        article_nicknames = self.find_article_nickname()
        article_datas = self.find_article_data()
        article_dates = self.find_article_date()
        article_profileurls = self.find_article_profileurl()
        article_likes = self.find_article_like()
        article_url = self.driver.current_url
        replies = []
        for i in range(len(article_ids)):
            reply = dict()
            reply['article_id'] = article_ids[i]
            reply['article_nickname'] = article_nicknames[i]
            reply['article_data'] = article_datas[i]
            reply['article_date'] = article_dates[i]
            reply['article_profileurl'] = article_profileurls[i]
            reply['reply_url'] = article_likes[i]
            reply['platform_name'] = 'kakaostory'
            reply['article_form'] = 'reply'
            reply['article_url'] = article_url
            reply['article_order'] = str(i)
            replies.append(reply)
        return replies


class EffectKakaostory(object):
    """Crawl one KakaoStory URL and store its effect statistics."""

    def __init__(self, event_num, event_code, url, driver):
        self.event_num = event_num
        self.event_code = event_code
        self.url = url
        self.driver = driver

    def start(self):
        """Crawl body + replies, compute statistics and send them to the DB.

        :raises effect.effecterror.OutDatedCrawler: page load / crawl failed.
        :raises effect.effecterror.DeletedUrlError: the URL was deleted.
        :raises effect.effecterror.UnknownError: statistics or settings failed.
        :raises effect.effecterror.DBQueryError: the DB send failed.
        """
        try:
            self.driver.get(self.url)
            wait(3)
            body_crawler = BodyCrawler(self.driver)
            reply_crawler = ReplyCrawler(self.driver)
        except Exception as e:
            raise effect.effecterror.OutDatedCrawler(str(e))
        try:
            error = body_crawler.find_error()
        except Exception as e:
            raise effect.effecterror.OutDatedCrawler(str(e))
        if error:
            raise effect.effecterror.DeletedUrlError("The URL is Deleted")
        try:
            body = body_crawler.get()
            replies = reply_crawler.get()
        except Exception as e:
            raise effect.effecterror.OutDatedCrawler(str(e))
        if not body.get('article_id', ''):
            raise effect.effecterror.OutDatedCrawler("NoData Crawled")
        try:
            result = self.statistics(body, replies)
            result['lastupdate_filter'] = \
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            result['status'] = 'OK'
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e))
        try:
            cg = get_settings()
        except Exception as e:
            raise effect.effecterror.UnknownError(
                str(e) + '\n' + 'effect.ini setting error')
        try:
            result_sender = ResultSender(cg['host'], cg['user'],
                                         cg['pass'], cg['name']) \
                if cg else ResultSender()
            result_sender.connect()
            result_sender.send('stats_s1_effect', result)
            result_sender.close()
        except Exception as e:
            raise effect.effecterror.DBQueryError(str(e))

    def statistics(self, body, replies):
        """Build the statistics row from the crawled body and replies."""
        result = {}
        # 'article_hit' is never set by BodyCrawler, so this defaults to 0.
        result['viewcount'] = int(body.get('article_hit', 0))
        result['event_num'] = self.event_num
        # BUGFIX: the original passed 0 as the *base* argument of int()
        # — int(body.get('article_order'), 0) — which raises TypeError when
        # the key is missing; 0 belongs in .get() as the default.
        result['replycount'] = int(body.get('article_order', 0))
        result['likecount'] = int(body.get('reply_url', 0))
        result['interactioncount'] = self.get_replycount(body, replies)
        result['replybuzz'] = self.get_reply_buzz(body, replies)
        result['engagementcount'] = \
            result.get('likecount', 0) + result.get('replycount', 0)
        return result

    def get_replycount(self, body, replies):
        """Return the number of distinct repliers, excluding the author."""
        set_reply_id = {reply.get('article_id', '') for reply in replies}
        if body.get('article_id') in set_reply_id:
            return len(set_reply_id) - 1
        return len(set_reply_id)

    def get_reply_buzz(self, body, replies):
        """Return a JSON array of per-day reply counts.

        One {'date': 'YYYYMMDD', 'value': n} entry for each day between the
        article date and today.
        """
        start_date = datetime.datetime.strptime(
            body['article_date'], '%Y-%m-%d %H:%M:%S').date()
        end_date = datetime.datetime.now().date()
        date_dict = dict()
        while start_date <= end_date:
            date_dict[start_date.strftime('%Y%m%d')] = 0
            start_date = start_date + datetime.timedelta(days=1)
        for reply in replies:
            str_reply_date = reply.get('article_date', '1990-01-01 00:00:00')
            try:
                # BUGFIX: the original formatted reply dates as '%m-%d-%Y',
                # which can never match the '%Y%m%d' keys built above, so
                # every bucket stayed 0.
                reply_day = datetime.datetime.strptime(
                    str_reply_date,
                    '%Y-%m-%d %H:%M:%S').date().strftime('%Y%m%d')
            except ValueError:
                # get_date() may yield the '0000-00-00 00:00:00' sentinel,
                # which strptime rejects; skip such replies.
                continue
            if reply_day in date_dict:
                date_dict[reply_day] = date_dict[reply_day] + 1
        json_array = [{'date': k, 'value': v} for k, v in date_dict.items()]
        return json.dumps(json_array, sort_keys=True)