当前位置：文档之家› C++或C抓取网页内容

C++或C抓取网页内容

C++或C抓取网页内容
2009-03-04 18:30
Windows VC下的做法：

#include
#include

int main(int argc, char* argv[])
{
CInternetSession session("HttpClient");
char * url = " https://www.doczj.com/doc/e06059109.html,/simcard.php?simcard=1392658";
CHttpFile* pfile = (CHttpFile *)session.OpenURL(url);

DWORD dwStatusCode;
pfile -> QueryInfoStatusCode(dwStatusCode);
if(dwStatusCode == HTTP_STATUS_OK)
{
CString content;
CString data;
while (pfile -> ReadString(data))
{
content += data + "\r\n";
}
content.TrimRight();
printf(" %s\n " ,(LPCTSTR)content);
}
pfile -> Close();
delete pfile;
session.Close();

return 0 ;
}

Windows下用socket：
#include
#include
#include
#include "winsock2.h"
#include

#pragma comment(lib, "ws2_32.lib")

using namespace std;

#define DEFAULT_PAGE_BUF_SIZE 1048576

void main()
{
WSADATA wsaData;
int err;
err = WSAStartup(MAKEWORD(2,2), &wsaData);
if( err != 0 )
{
return;
}

// timer is start

clock_t start, finish;
double duration;
start = clock();

char host[] = "https://www.doczj.com/doc/e06059109.html,";
char *request = "GET / HTTP/1.0\r\nHost: https://www.doczj.com/doc/e06059109.html,\r\nConnection: Close\r\n\r\n";

struct hostent *hp;
hp = gethostbyname(host);
if(hp == NULL)
{
cout << "gethostbyname() error in GetIpByHost: " << host << endl;
return;
}

// 获取域名对应的IP

struct in_addr inAddr;
LPSTR lpAddr;
lpAddr = hp->h_addr;
memmove(&inAddr,lpAddr,4);

int sock, ret = 0, optval = 1;
struct sockaddr_in sa;
sa.sin_family = AF_INET;
sa.sin_port = htons(80);
sa.sin_addr.s_addr = inet_addr(inet_ntoa(inAddr));

sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
connect(sock, (SOCKADDR*)&sa, sizeof(sa));
if(sock == -1)
{
return;
}
if(sock == -2)
{
return;
}

// send the "GET" data

ret = send(sock, request, strlen(request), 0);

// 网页内容长度。可以从http头部数据中获取 "Content-Length:"

int m_nContentLength = DEFAULT_PAGE_BUF_SIZE;

char *pageBuf;
pageBuf = (char *)malloc(m_nContentLength);
memset(pageBuf, 0, m_nContentLength);

int bytesRead = 0;
while(ret > 0)
{
ret = recv(sock, pageBuf + bytesRead, m_nContentLength - bytesRead, 0);

if(ret > 0)
{
bytesRead += ret;
}
}
pageBuf[bytesRead] = '\0';

cout << bytesRead << endl;

// write the html content to the file

ofstream ofs;
ofs.open("ofs.txt");
ofs << pageBuf << endl;

ofs.close();
free(pageBuf);
closesocket(sock);
WSACleanup();

// timer is finish

finish = clock();
duration = (double)(finish - start) / CLOCKS_PER_SEC;
cout << "have cost " << duration << " seconds\n";

return;
}

其他如不从缓存中读取内容及如何使用代理连接现在就不说了，可以参考下面的链接，或者下次补上。另外不妨看看 Java 是如何读取 URL 内容的，更简单：

// Fetch a URL with Commons HttpClient and print the response body when the
// server answers with HTTP 200.
GetMethod httpMethod = new GetMethod("https://www.doczj.com/doc/e06059109.html,");
try
{
    int statusCode = new HttpClient().executeMethod(httpMethod);
    if(statusCode == HttpStatus.SC_OK)
    {
        System.out.println(httpMethod.getResponseBodyAsString());
    }
}
finally
{
    // Release the connection even when executeMethod throws.
    httpMethod.releaseConnection();
}
内容取过来之后，总是希望从中拣出需要的数据，可惜 VC6 中没有自己的正则表达式库，所以下一步要学用 boost 的正则表达式库。

Linux 下最简单，三种办法：

如果在C程序想抓取网页内容，比如百度
方法1. 执行 system("wget http://www.baidu.com -q -O baidu.html")，然后再分析 baidu.html。
方法2. 执行 system("curl http://www.baidu.com")。
方法3. 直接写 socket 程序获取 http://www.baidu.com 的内容存入 buffer 中。