웹 크롤링
웹 크롤링 과정
맥도날드 홈페이지의 HTML 정보 읽어오는 코드
#include <iostream>
#include <cpprest/http_client.h>
#include <cpprest/filestream.h>
using namespace web::http;
using namespace web::http::client;
using std::wcout;
using std::endl;
// Fetches the McDonald's Korea main page with a blocking GET request and
// prints the response body to wcout.
//
// The calls to .get() block until the request / body extraction finishes.
void HttpRequest()
{
    // The URL must be a bare URI: the previous literal was wrapped in '<' and
    // '>' (a copy/paste artifact), which is not a valid URI.
    http_client client(U("https://www.mcdonalds.co.kr/kor/main.do"));
    http_request req(methods::GET);

    auto resp = client.request(req).get();

    // extract_string(true): the argument is cpprestsdk's ignore_content_type
    // flag, so the body is extracted regardless of the Content-Type header.
    wcout << resp.extract_string(true).get() << endl;
}
// Program entry point: installs the "kor" locale on wcout, then runs the
// single HTTP request.
int main()
{
    // NOTE(review): "kor" looks like an MSVC-style locale name, presumably
    // chosen so Korean text prints correctly -- confirm on the target platform.
    const std::locale korean_locale("kor");
    wcout.imbue(korean_locale);

    HttpRequest();

    return 0;
}
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <cpprest/http_client.h>
#include <cpprest/filestream.h>
#include <cpprest/json.h>
#include <string>
using namespace std;
using namespace utility;
using namespace web;
using namespace web::http;
using namespace web::http::client;
using namespace web::json;
using namespace concurrency::streams;
using namespace utility::conversions;
// Percent-encodes every non-ASCII byte of `origin` as "%XX" (uppercase hex).
//
// ASCII bytes (0x00-0x7F) are copied through unchanged, including URL-reserved
// characters such as ':', '/', '?', '&' and '+'. That lets a complete URL be
// passed in without damaging its structure; only the multi-byte (e.g. UTF-8
// Korean) parts are escaped.
//
// @param origin  Byte string to encode (typically UTF-8).
// @return        Copy of `origin` with each byte >= 0x80 replaced by "%XX".
std::string url_encode(const std::string& origin) {
    static constexpr char kHexDigits[] = "0123456789ABCDEF";

    std::string ret;
    ret.reserve(origin.size());  // lower bound; grows if anything is escaped

    for (const char ch : origin) {
        // Work on the unsigned value so bytes >= 0x80 are not seen as negative.
        const auto byte = static_cast<unsigned char>(ch);

        // The high bit is clear for ASCII (0-127): copy as-is.
        if (byte < 0x80) {
            ret += ch;
            continue;
        }

        // Non-ASCII byte: emit '%' followed by its two hex digits.
        ret += '%';
        ret += kHexDigits[byte >> 4];
        ret += kHexDigits[byte & 0x0F];
    }
    return ret;
}
// Asks the user for a district (gu) and neighborhood (dong), searches Naver
// for "<gu>+<dong>+날씨", and prints the HTTP status code followed by the raw
// HTML of the result page.
//
// Intermediate values (search terms, raw URL, encoded URL) are echoed to wcout
// as the request is built. The calls to .get() block until completion.
void HttpRequest() {
    wstring search_gu, search_dong;
    cout << "어디 날씨가 궁금해? OO구 OO동으로 입력하자.";
    wcin >> search_gu >> search_dong;

    // Join the terms with '+', the form-encoding of a space in a URL query.
    // (Previously gu and dong were concatenated with an empty separator.)
    const wstring search = search_gu + U("+") + search_dong + U("+") + U("날씨");
    wcout << search << endl;

    // Bare URI: the previous literal was wrapped in '<' ... '>', which made the
    // URI invalid and left a stray '>' in front of the query terms.
    wstring url = U("https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=0&ie=utf8&query=");
    url += search;
    wcout << url << endl;

    // Convert to UTF-8 and percent-encode the non-ASCII bytes. The encoded
    // result contains only ASCII, so widening it byte-by-byte is lossless.
    const string utf_8_url = url_encode(to_utf8string(url));
    const wstring path(utf_8_url.begin(), utf_8_url.end());
    wcout << path << endl;

    http_client client(path);
    http_request get_req(methods::GET);
    auto get_resp = client.request(get_req).get();
    cout << get_resp.status_code() << " : sync request" << endl;

    // extract_string(true): ignore the Content-Type header when extracting.
    // The body is printed directly; no intermediate copy is needed.
    const auto html = get_resp.extract_string(true).get();
    wcout << html << endl;
}
// Program entry point: installs the "kor" locale on both wide console streams,
// then runs the weather search request.
int main() {
    // NOTE(review): "kor" looks like an MSVC-style locale name, presumably
    // needed so Korean input/output works -- confirm on the target platform.
    const locale korean_locale("kor");
    wcin.imbue(korean_locale);
    wcout.imbue(korean_locale);

    HttpRequest();

    return 0;
}
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <cpprest/http_client.h>
#include <cpprest/filestream.h>
#include <cpprest/json.h>
#include <string>
using namespace std;
using namespace utility;
using namespace web;
using namespace web::http;
using namespace web::http::client;
using namespace web::json;
using namespace concurrency::streams;
using namespace utility::conversions;
// Percent-encodes every non-ASCII byte of `origin` as "%XX" (uppercase hex).
//
// ASCII bytes (0x00-0x7F) are copied through unchanged, including URL-reserved
// characters such as ':', '/', '?', '&' and '+'. That lets a complete URL be
// passed in without damaging its structure; only the multi-byte (e.g. UTF-8
// Korean) parts are escaped.
//
// @param origin  Byte string to encode (typically UTF-8).
// @return        Copy of `origin` with each byte >= 0x80 replaced by "%XX".
std::string url_encode(const std::string& origin) {
    static constexpr char kHexDigits[] = "0123456789ABCDEF";

    std::string ret;
    ret.reserve(origin.size());  // lower bound; grows if anything is escaped

    for (const char ch : origin) {
        // Work on the unsigned value so bytes >= 0x80 are not seen as negative.
        const auto byte = static_cast<unsigned char>(ch);

        // The high bit is clear for ASCII (0-127): copy as-is.
        if (byte < 0x80) {
            ret += ch;
            continue;
        }

        // Non-ASCII byte: emit '%' followed by its two hex digits.
        ret += '%';
        ret += kHexDigits[byte >> 4];
        ret += kHexDigits[byte & 0x0F];
    }
    return ret;
}
void HttpRequest()
{
//url 인코딩
wstring search_gu, search_dong;
cout << "어디 날씨가 궁금해? OO구 OO동으로 입력하자.";
wcin >> search_gu >> search_dong;
wstring search = search_gu + U("+") + search_dong + U("+") + U("날씨");
wcout << search << endl;
wstring url = U("<https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=0&ie=utf8&query=>");
url += search;
wcout << url << endl;
string utf_8_url = url_encode(to_utf8string(url));
wstring path;
path.assign(utf_8_url.begin(), utf_8_url.end());
wcout << path << endl;
//크롤링
http_client client(path);
http_request get_req(methods::GET);
auto get_resp = client.request(get_req).get();
//cout << get_resp.status_code() << " : sync request" << endl;
auto html = get_resp.extract_string(true).get();
//wcout << html << endl;
//파싱
wstring find_st = U("현재 온도</span>");
wstring find_en = U("<span class=\\"celsius\\">");
auto index_st = html.find(find_st);
auto index_en = html.find(find_en);
string::npos;
wstring celsius;
celsius.assign(html.begin() + index_st + find_st.size(), html.begin() + index_en);
wcout << U("현재 온도 : ") << celsius << endl;
}
// Program entry point: installs the "kor" locale on both wide console streams,
// then runs the weather lookup.
int main() {
    // NOTE(review): "kor" looks like an MSVC-style locale name, presumably
    // needed so Korean input/output works -- confirm on the target platform.
    const locale korean_locale("kor");
    wcin.imbue(korean_locale);
    wcout.imbue(korean_locale);

    HttpRequest();

    return 0;
}
해당 코드 알고리즘
Server
Client • Server에게 요청을 보내는 프로그램 또는 장치
API
REST API