// NewsCrawler.Crawler
// Fetches Korean news "flash" pages over HTTP, extracts headline/time/URL from each page and
// forwards every item to the NewsForm listener through m_Listener.InsertItem(...).
//
// NOTE(review): this copy of the file looks damaged - confirm against version control.
//   * Many statements share one physical line, so each "//" line comment in the code
//     (the ReadDartAPITest call, the two Console.WriteLine debug lines) is followed on the
//     same physical line by text that reads as live code.
//   * In ResponseEtoday2 the statement starting with `int iCDATALen = "` is cut off; the
//     rest of that method and the beginning of ResponseEdaily (the callback that ReadEdaily
//     registers) are not present.
//   The code below is left exactly as found; only comment lines were added.
//
// Members visible in this file
//   m_Listener       - receiver of parsed items; assigned in the constructor.
//   m_Today          - DateTime.Now captured when the Crawler is constructed; the two
//                      FinancialNews handlers compare its DayOfYear with each item's.
//   m_iDartAPIRetry  - initialised to 10; not referenced in the visible code.
//   m_ProfitChange   - Regex built from a Korean pattern starting with "30%"; not referenced
//                      in the visible code (presumably used by ReadProfitChange - verify).
//   Crawler(Listener)- stores the listener, then calls Test().
//   REQUEST_STATUS   - state object handed to BeginGetResponse: the request, the bInitial
//                      flag, a Stopwatch, and two strings (m_strCodeName, m_strAuthKey) that
//                      the visible code never sets.
//   Test()           - returns immediately when Util.IsDebugging() is false; otherwise calls
//                      ReadSupplyContract / ReadRevenue / ReadRightsIssue / ReadPatent /
//                      ReadProfitChange with fixed m.dart.fss.or.kr URLs. Those methods are
//                      not defined in the visible code.
//
// Shared shape of the Read<Site> / Response<Site> pairs
//   Read<Site>(bool bInitial = false)
//     Creates an HttpWebRequest for a fixed URL, sets CredentialCache.DefaultCredentials and
//     Timeout = 2000, fills a REQUEST_STATUS (request, bInitial, started Stopwatch) and calls
//     BeginGetResponse with the matching Response<Site> callback. Any exception is written
//     to Console (message + stack trace) and not rethrown.
//   Response<Site>(IAsyncResult result)
//     Takes the REQUEST_STATUS from result.AsyncState, stops the Stopwatch, obtains the
//     response with HttpReq.GetResponse(), reads the whole body, HtmlDecodes it, extracts
//     the items and calls
//       m_Listener.InsertItem(title, "", "", time, DateTime.Now, url, sourceName,
//                             State.m_Timer.Elapsed.TotalMilliseconds, bInitial)
//     for each one. HttpReq.EndGetResponse(result) is called after the using blocks.
//     Any exception is written to Console and not rethrown.
//
// Per-site details
//   AsiaE          - body decoded as EUC-KR; every doubled double-quote is replaced by a
//                    single one before parsing. Items: //div[contains(@class, 'txtform')]/ul/li.
//                    Title is the longer of the <a> "title" attribute and the <a> first
//                    child's InnerText. Items whose time text matches \d+/\d+ are skipped;
//                    the rest are parsed with "HH:mm".
//                    NOTE(review): the SelectNodes result is iterated without a null check
//                    (ResponseEtoday and ResponseFinancialNews do check) - confirm intended.
//   Etoday         - body decoded as utf-8; returns early when no <li> nodes are found (that
//                    return happens before the EndGetResponse statement). Article URL is a
//                    fixed newsview.php prefix plus the digits of the item's href.
//   Etoday2        - creates a second WebRequest for an RSS URL; the remainder is missing
//                    (see the damage note above). No caller is visible in this file.
//   Edaily         - only ReadEdaily and the tail of its callback are visible.
//   MoneyToday     - body decoded as EUC-KR. Items: //div[@id='articleList']//li[@class='bundle'].
//                    The article id is cut out of the href using the third and fourth single
//                    quotes: Substring(iStart+1, iEnd-iStart+1).
//                    NOTE(review): that length is two more than the number of characters
//                    between the two quotes - looks like an off-by-two; confirm.
//                    The TryParseExact result is ignored. The loop stops at the first item
//                    whose time is later than the previous item's (PrevTime starts as
//                    DateTime.Now).
//   FinancialNewsJson - body decoded as UTF-8 and deserialised with Newtonsoft.Json into a
//                    dynamic; reads "title", "date" ("yyyy.MM.dd HH:mm") and "code" of each
//                    entry and inserts only entries whose DayOfYear equals m_Today.DayOfYear.
//   FinancialNews  - body decoded as UTF-8. Items: //div[@class='categoryList']//li; returns
//                    early when none are found. Title and href come from the second <a> of
//                    each item. The loop stops at the first item whose DayOfYear is smaller
//                    than m_Today.DayOfYear. The local `Today` is assigned but never read.
//                    NOTE(review): both FinancialNews handlers compare DayOfYear only, so the
//                    year is not taken into account - confirm this is acceptable.
//
// Next physical line: using directives, namespace/class header, fields, constructor,
// REQUEST_STATUS, Test(), and the first part of ResponseAsiaE.
using System; using System.Collections.Generic; using System.Diagnostics; using System.Globalization; using System.IO; using System.Linq; using System.Net; using System.Net.Http; using System.Net.Http.Headers; using System.Text; using System.Text.RegularExpressions; using System.Threading.Tasks; namespace NewsCrawler { public class Crawler { NewsForm m_Listener = null; DateTime m_Today = DateTime.Now; int m_iDartAPIRetry = 10; Regex m_ProfitChange = new Regex("30%.*이상.*(변경|변동)"); public Crawler(NewsForm Listener) { m_Listener = Listener; Test(); } class REQUEST_STATUS { public HttpWebRequest m_HTTPReq = null; public bool m_bInitial = false; public Stopwatch m_Timer = new Stopwatch(); public string m_strCodeName = ""; public string m_strAuthKey = ""; }; void Test() { if(Util.IsDebugging() == false) return; ReadSupplyContract(false, "강스템바이오텍", "http://m.dart.fss.or.kr/viewer/main.st?rcpNo=20170111900149"); ReadRevenue(false, "제일기획", "http://m.dart.fss.or.kr/viewer/main.st?rcpNo=20170126800508"); ReadRevenue(false, "LS산전", "http://m.dart.fss.or.kr/viewer/main.st?rcpNo=20170126800581"); ReadRightsIssue(false, "옴니텔", "http://m.dart.fss.or.kr/viewer/main.st?rcpNo=20170126000525"); ReadPatent(false, "인트론바이오", "http://m.dart.fss.or.kr/viewer/main.st?rcpNo=20170125900080"); ReadProfitChange(false, "세하", "http://m.dart.fss.or.kr/viewer/main.st?rcpNo=20170206800276"); ReadProfitChange(false, "아세아", "http://m.dart.fss.or.kr/viewer/main.st?rcpNo=20170206800894"); //ReadDartAPITest(false, "http://dart.fss.or.kr/api/search.json?end_dt=20170206&sort=date&series=desc&page_set=100"); } void ResponseAsiaE(IAsyncResult result) { REQUEST_STATUS State = (REQUEST_STATUS)result.AsyncState; HttpWebRequest HttpReq = State.m_HTTPReq; bool bInitial = State.m_bInitial; State.m_Timer.Stop(); try { string strServerURL = "http://www.asiae.co.kr"; using(HttpWebResponse response = (HttpWebResponse)HttpReq.GetResponse()) { using(Stream dataStream = response.GetResponseStream()) { using(StreamReader 
// Next physical line: remainder of ResponseAsiaE, all of ReadAsiaE, start of ResponseEtoday.
// The Korean text in the commented-out Console.WriteLine means "yesterday's article".
reader = new StreamReader(dataStream, Encoding.GetEncoding("EUC-KR"))) { string responseFromServer = WebUtility.HtmlDecode(reader.ReadToEnd()); responseFromServer = responseFromServer.Replace("\"\"", "\""); HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument(); doc.LoadHtml(responseFromServer); string strXPath = "//div[contains(@class, 'txtform')]/ul/li"; var lists = doc.DocumentNode.SelectNodes(strXPath); foreach(var item in lists) { string strTitle1 = item.SelectSingleNode(".//a").GetAttributeValue("title", ""); var child = item.SelectSingleNode(".//a").FirstChild; string strTitle2 = ""; if (child != null) strTitle2 = child.InnerText; string strTitle = (strTitle1.Length > strTitle2.Length ? strTitle1 : strTitle2); string strTime = item.SelectSingleNode(".//span").InnerText; string strURL = strServerURL+item.SelectSingleNode(".//a").GetAttributeValue("href", ""); if(Regex.IsMatch(strTime, @"\d+/\d+")==true) { //Console.WriteLine("어제 기사 : " + item.InnerHtml); continue; } m_Listener.InsertItem(strTitle, "", "", DateTime.ParseExact(strTime, "HH:mm", CultureInfo.CurrentCulture), DateTime.Now, strURL, "아시아경제", State.m_Timer.Elapsed.TotalMilliseconds, bInitial); } } } } HttpReq.EndGetResponse(result); } catch(Exception ex) { Console.WriteLine(ex.Message + Environment.NewLine + ex.StackTrace); } } public void ReadAsiaE(bool bInitial = false) { try { HttpWebRequest HttpReq = WebRequest.Create("http://www.asiae.co.kr/news/sokbo/sokbo_left.htm") as HttpWebRequest; HttpReq.Credentials = CredentialCache.DefaultCredentials; HttpReq.Timeout = 2000; REQUEST_STATUS State = new REQUEST_STATUS(); State.m_HTTPReq = HttpReq; State.m_bInitial = bInitial; State.m_Timer.Start(); HttpReq.BeginGetResponse(new AsyncCallback(ResponseAsiaE), State); } catch(Exception ex) { Console.WriteLine(ex.Message + Environment.NewLine + ex.StackTrace); } } void ResponseEtoday(IAsyncResult result) { REQUEST_STATUS State = (REQUEST_STATUS)result.AsyncState; HttpWebRequest HttpReq = 
// Next physical line: remainder of ResponseEtoday, all of ReadEtoday, and the "void" keyword
// that begins ResponseEtoday2.
State.m_HTTPReq; bool bInitial = State.m_bInitial; State.m_Timer.Stop(); try { using(HttpWebResponse response = (HttpWebResponse)HttpReq.GetResponse()) { using(Stream dataStream = response.GetResponseStream()) { using(StreamReader reader = new StreamReader(dataStream, Encoding.GetEncoding("utf-8"))) { string responseFromServer = WebUtility.HtmlDecode(reader.ReadToEnd()); HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument(); doc.LoadHtml(responseFromServer); string strXPath = "//div[contains(@class, 'flash_tab_lst')]/ul/li"; var lists = doc.DocumentNode.SelectNodes(strXPath); if (lists == null) { return; } foreach (var item in lists) { string strTitle = item.SelectSingleNode(".//a").InnerText.Trim(); string strTime = item.SelectSingleNode(".//span[contains(@class, 'flash_press')]").InnerText.Trim(); string strURL = item.SelectSingleNode(".//a").GetAttributeValue("href", ""); strURL="https://www.etoday.co.kr/news/section/newsview.php?idxno="+Regex.Replace(strURL, @"\D", ""); if(Regex.IsMatch(strTime, @"\d+/\d+")==true) { //Console.WriteLine("어제 기사 : " + item.InnerHtml); continue; } m_Listener.InsertItem(strTitle, "", "", DateTime.ParseExact(strTime, "HH:mm", CultureInfo.CurrentCulture), DateTime.Now, strURL, "이투데이", State.m_Timer.Elapsed.TotalMilliseconds, bInitial); } } } } HttpReq.EndGetResponse(result); } catch(Exception ex) { Console.WriteLine(ex.Message + Environment.NewLine + ex.StackTrace); } } public void ReadEtoday(bool bInitial = false) { try { HttpWebRequest HttpReq = WebRequest.Create("https://www.etoday.co.kr//news/flashnews/flash_list") as HttpWebRequest; HttpReq.Credentials = CredentialCache.DefaultCredentials; HttpReq.Timeout = 2000; REQUEST_STATUS State = new REQUEST_STATUS(); State.m_HTTPReq = HttpReq; State.m_bInitial = bInitial; State.m_Timer.Start(); HttpReq.BeginGetResponse(new AsyncCallback(ResponseEtoday), State); } catch(Exception ex) { Console.WriteLine(ex.Message + Environment.NewLine + ex.StackTrace); } } void 
// Next physical line: start of ResponseEtoday2, then the truncated span (from
// `int iCDATALen = "` onward the text jumps into the tail of the Edaily callback), then
// all of ReadEdaily and the first part of ResponseMoneyToday.
// NOTE(review): restore the missing code from the original file before relying on this part.
ResponseEtoday2(IAsyncResult result) { REQUEST_STATUS State = (REQUEST_STATUS)result.AsyncState; HttpWebRequest HttpReq = State.m_HTTPReq; bool bInitial = State.m_bInitial; State.m_Timer.Stop(); try { WebRequest request = WebRequest.Create("http://rss.etoday.co.kr/etoday/etoday_news_all.xml"); request.Credentials=CredentialCache.DefaultCredentials; request.Timeout=2000; int iCDATALen = " PrevTime) break; PrevTime = Time; m_Listener.InsertItem(strTitle, "", "", Time, DateTime.Now, strURL, "이데일리", State.m_Timer.Elapsed.TotalMilliseconds, bInitial); } } } } HttpReq.EndGetResponse(result); } catch(Exception ex) { Console.WriteLine(ex.Message + Environment.NewLine + ex.StackTrace); } } public void ReadEdaily(bool bInitial = false) { try { HttpWebRequest HttpReq = WebRequest.Create("http://www.edaily.co.kr/news/realtime/realtime_NewsList_1.asp") as HttpWebRequest; HttpReq.Credentials = CredentialCache.DefaultCredentials; HttpReq.Timeout = 2000; REQUEST_STATUS State = new REQUEST_STATUS(); State.m_HTTPReq = HttpReq; State.m_bInitial = bInitial; State.m_Timer.Start(); HttpReq.BeginGetResponse(new AsyncCallback(ResponseEdaily), State); } catch(Exception ex) { Console.WriteLine(ex.Message + Environment.NewLine + ex.StackTrace); } } void ResponseMoneyToday(IAsyncResult result) { REQUEST_STATUS State = (REQUEST_STATUS)result.AsyncState; HttpWebRequest HttpReq = State.m_HTTPReq; bool bInitial = State.m_bInitial; State.m_Timer.Stop(); try { string strServerURL = "http://news.mt.co.kr/mtview.php?no="; DateTime PrevTime = DateTime.Now; using(HttpWebResponse response = (HttpWebResponse)HttpReq.GetResponse()) { using(Stream dataStream = response.GetResponseStream()) { using(StreamReader reader = new StreamReader(dataStream, Encoding.GetEncoding("EUC-KR"))) { string responseFromServer = WebUtility.HtmlDecode(reader.ReadToEnd()); HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument(); doc.LoadHtml(responseFromServer); string strXPath = 
// Next physical line: remainder of ResponseMoneyToday (item loop and id extraction), all of
// ReadMoneyToday, and the first part of ResponseFinancialNewsJson.
"//div[@id='articleList']//li[@class='bundle']"; var lists = doc.DocumentNode.SelectNodes(strXPath); foreach(var item in lists) { string strTitle = item.SelectSingleNode(".//a").InnerText; string strTime = item.SelectSingleNode(".//span").InnerText; string strID = item.SelectSingleNode(".//a").GetAttributeValue("href", ""); int iStart = strID.IndexOf('\'', 0); iStart = strID.IndexOf('\'', iStart+1); iStart = strID.IndexOf('\'', iStart+1); int iEnd = strID.IndexOf('\'', iStart+1); strID = strID.Substring(iStart+1, iEnd-iStart+1); string strURL = strServerURL+strID; DateTime Time; DateTime.TryParseExact(strTime, "HH:mm", CultureInfo.CurrentCulture, DateTimeStyles.None, out Time); if(Time > PrevTime) break; PrevTime = Time; m_Listener.InsertItem(strTitle, "", "", Time, DateTime.Now, strURL, "머니투데이", State.m_Timer.Elapsed.TotalMilliseconds, bInitial); } } } } HttpReq.EndGetResponse(result); } catch(Exception ex) { Console.WriteLine(ex.Message + Environment.NewLine + ex.StackTrace); } } public void ReadMoneyToday(bool bInitial = false) { try { HttpWebRequest HttpReq = WebRequest.Create("http://news.mt.co.kr/newsflash/newsflash.html") as HttpWebRequest; HttpReq.Credentials = CredentialCache.DefaultCredentials; HttpReq.Timeout = 2000; REQUEST_STATUS State = new REQUEST_STATUS(); State.m_HTTPReq = HttpReq; State.m_bInitial = bInitial; State.m_Timer.Start(); HttpReq.BeginGetResponse(new AsyncCallback(ResponseMoneyToday), State); } catch(Exception ex) { Console.WriteLine(ex.Message + Environment.NewLine + ex.StackTrace); } } void ResponseFinancialNewsJson(IAsyncResult result) { REQUEST_STATUS State = (REQUEST_STATUS)result.AsyncState; HttpWebRequest HttpReq = State.m_HTTPReq; bool bInitial = State.m_bInitial; State.m_Timer.Stop(); try { string strServerURL = "http://www.fnnews.com/news/"; using(HttpWebResponse response = (HttpWebResponse)HttpReq.GetResponse()) { using(Stream dataStream = response.GetResponseStream()) { using(StreamReader reader = new StreamReader(dataStream, 
// Next physical line: remainder of ResponseFinancialNewsJson, all of ReadFinancialNewsJson,
// and the first part of ResponseFinancialNews.
Encoding.GetEncoding("UTF-8"))) { string responseFromServer = WebUtility.HtmlDecode(reader.ReadToEnd()); dynamic jObj = Newtonsoft.Json.JsonConvert.DeserializeObject(responseFromServer); foreach(var data in jObj) { string strTitle = data["title"]; string strTime = data["date"]; DateTime Time; DateTime.TryParseExact(strTime, "yyyy.MM.dd HH:mm", CultureInfo.CurrentCulture, DateTimeStyles.None, out Time); string strCode = data["code"]; string strURL = strServerURL+strCode; if(Time.DayOfYear == m_Today.DayOfYear) m_Listener.InsertItem(strTitle, "", "", Time, DateTime.Now, strURL, "파이낸셜뉴스", State.m_Timer.Elapsed.TotalMilliseconds, bInitial); } } } } HttpReq.EndGetResponse(result); } catch(Exception ex) { Console.WriteLine(ex.Message + Environment.NewLine + ex.StackTrace); } } public void ReadFinancialNewsJson(bool bInitial = false) { try { HttpWebRequest HttpReq = WebRequest.Create("http://www.fnnews.com/newsflash/today/data?type=json") as HttpWebRequest; HttpReq.Credentials = CredentialCache.DefaultCredentials; HttpReq.Timeout = 2000; REQUEST_STATUS State = new REQUEST_STATUS(); State.m_HTTPReq = HttpReq; State.m_bInitial = bInitial; State.m_Timer.Start(); HttpReq.BeginGetResponse(new AsyncCallback(ResponseFinancialNewsJson), State); } catch(Exception ex) { Console.WriteLine(ex.Message + Environment.NewLine + ex.StackTrace); } } void ResponseFinancialNews(IAsyncResult result) { REQUEST_STATUS State = (REQUEST_STATUS)result.AsyncState; HttpWebRequest HttpReq = State.m_HTTPReq; bool bInitial = State.m_bInitial; State.m_Timer.Stop(); try { string strServerURL = "http://www.fnnews.com"; DateTime Today = DateTime.Now; using(HttpWebResponse response = (HttpWebResponse)HttpReq.GetResponse()) { using(Stream dataStream = response.GetResponseStream()) { using(StreamReader reader = new StreamReader(dataStream, Encoding.GetEncoding("UTF-8"))) { string responseFromServer = WebUtility.HtmlDecode(reader.ReadToEnd()); HtmlAgilityPack.HtmlDocument doc = new 
// Next physical line: remainder of ResponseFinancialNews (item loop with its own inner
// try/catch), all of ReadFinancialNews, and the closing braces of the class and namespace.
HtmlAgilityPack.HtmlDocument(); doc.LoadHtml(responseFromServer); string strXPath = "//div[@class='categoryList']//li"; var lists = doc.DocumentNode.SelectNodes(strXPath); if (lists == null) { return; } try { foreach(var item in lists) { string strTitle = item.SelectNodes(".//a")[1].InnerText; string strDateTime = item.SelectSingleNode(".//span[@class='category_date']").InnerText; string strURL = strServerURL+item.SelectNodes(".//a")[1].GetAttributeValue("href", ""); DateTime Time; DateTime.TryParseExact(strDateTime, "yyyy.MM.dd HH:mm", CultureInfo.CurrentCulture, DateTimeStyles.None, out Time); if(Time.DayOfYear < m_Today.DayOfYear) break; m_Listener.InsertItem(strTitle, "", "", Time, DateTime.Now, strURL, "파이낸셜뉴스", State.m_Timer.Elapsed.TotalMilliseconds, bInitial); } } catch(Exception ex) { Console.WriteLine(ex.Message + Environment.NewLine + ex.StackTrace); } } } } HttpReq.EndGetResponse(result); } catch(Exception ex) { Console.WriteLine(ex.Message + Environment.NewLine + ex.StackTrace); } } public void ReadFinancialNews(bool bInitial = false) { try { HttpWebRequest HttpReq = WebRequest.Create("http://www.fnnews.com/newsflash") as HttpWebRequest; HttpReq.Credentials = CredentialCache.DefaultCredentials; HttpReq.Timeout = 2000; REQUEST_STATUS State = new REQUEST_STATUS(); State.m_HTTPReq = HttpReq; State.m_bInitial = bInitial; State.m_Timer.Start(); HttpReq.BeginGetResponse(new AsyncCallback(ResponseFinancialNews), State); } catch(Exception ex) { Console.WriteLine(ex.Message + Environment.NewLine + ex.StackTrace); } } } }