diff --git a/WebBasedCrawler/facebook/facebookcrawlbs.py b/WebBasedCrawler/facebook/facebookcrawlbs.py
index 3e02dfb..a94d580 100644
--- a/WebBasedCrawler/facebook/facebookcrawlbs.py
+++ b/WebBasedCrawler/facebook/facebookcrawlbs.py
@@ -58,8 +58,12 @@ class FacebookInit(CrawlInit):
         # return trimmed_list
 
     def make_url(self):
-        # return [self.urls[self.platform()] + x + "?fref=ts" for x in self.split_searches()]
-        return [self.urls[self.platform()] + x for x in self.split_searches()]
+        # The conditional must be parenthesised: "a + b if c else d" parses as
+        # "(a + b) if c else d", which would drop the base URL for names.
+        base = self.urls[self.platform()]
+        return [base + ('profile.php?id=' + x if x.isnumeric() else x + "?fref=ts")
+                for x in self.split_searches()]
+        # return [self.urls[self.platform()] + x for x in self.split_searches()]
         # urls = list()
         # for x in self.split_searches():
         #     url = self.urls[self.platform()] + x + "?fref=ts"
@@ -113,11 +117,22 @@ class FacebookBodyCrawler:
     def find_article_nickname(self, soup):
         nickname = soup.find('div', class_='fbPhotoContributorName')
         if not nickname or not nickname.get_text():
-            span = soup.find('span', class_='fwb fcg')
-            if span:
-                nickname = span.a
-            else:
-                nickname = soup.find('a', 'profileLink')
+            # Drop an empty-text match so the "if not nickname" fallbacks run
+            # (a bs4 Tag is truthy even when it has no text).
+            nickname = None
+            temp_nickname = soup.select_one('span.fwb > a')
+            # select_one() returns None when nothing matches; guard before use.
+            if temp_nickname is not None and temp_nickname.has_attr('href'):
+                article_id = self.find_article_id(soup)
+                # Accept the link only when its href contains the article id.
+                if article_id and article_id in temp_nickname.get('href'):
+                    nickname = temp_nickname
+            if not nickname:
+                span = soup.find('span', class_='fwb fcg')
+                if span:
+                    nickname = span.a
+            if not nickname:
+                nickname = soup.find('a', 'profileLink')
         if not nickname:
             nickname = soup.find('a', class_='_2yug')
         return nickname.get_text() if nickname else ""