当前位置:文档之家› 网页提取源代码

网页提取源代码

// NOTE(review): the four angle-bracket header names were stripped from this
// copy of the file. They are restored here from the APIs the code uses
// (HINTERNET/InternetOpen, cout, ofstream) - confirm against the original.
#include <windows.h>
#include <wininet.h>
#include <fstream>
#include <iostream>
#include"stdio.h"
#include"string.h"
// String-pointer alias used by statistics() and TrimString(); declared here as plain char*.
typedef char* LPTSTR;
// Unsigned counter type used for TrimString()'s tag/brace depth arguments.
typedef unsigned int UINT;
// Size in bytes of the buffer download() uses for each read from the network.
#define MAXBLOCKSIZE 1024
// Linker directive (MSVC pragma): link against the WinINet import library.
#pragma comment (lib, "wininet.lib")

void download(const char *Url)
{
HINTERNET hSession = InternetOpen("RookIE/1.0", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
if (hSession != NULL)
{
HINTERNET handle2 = InternetOpenUrl(hSession, Url, NULL, 0, INTERNET_FLAG_DONT_CACHE, 0);
if (handle2 != NULL)
{
cout<byte Temp[MAXBLOCKSIZE];
ULONG Number = 1;
ofstream ofs("download.txt");
if(ofs)
{
while (Number > 0)
{
InternetReadFile(handle2, Temp, MAXBLOCKSIZE - 1, &Number);
ofs<}
ofs.close();
}
InternetCloseHandle(handle2);
handle2 = NULL;
}
InternetCloseHandle(hSession);
hSession = NULL;
}
}

// Estimates how much of a 100000-byte buffer holds data by scanning backwards
// from the end while neighbouring bytes are equal, i.e. by skipping the
// trailing run of identical bytes.
//
// pszBuffer: must point to at least 100000 readable bytes.
//
// Returns the number of bytes up to and including the first byte of that
// trailing run (for "hello" followed by zero bytes the result is 6), 1 when
// the whole buffer is one uniform run, and 0 when pszBuffer is NULL.
int statistics(LPTSTR pszBuffer)
{
    const int kBufferSize = 100000;

    if (pszBuffer == NULL)
        return 0;

    int i = kBufferSize;
    // The i > 1 bound keeps the scan from comparing against the byte before
    // the start of the buffer when every byte in it is identical.
    while (i > 1 && pszBuffer[i - 1] == pszBuffer[i - 2])
    {
        --i;
    }
    return i;
}

// Steps over one six-character entity (the code assumes the "&nbsp;" shape)
// starting at p, but never moves past the string terminator.
static char *SkipEntity(char *p)
{
    for (int n = 0; n < 6 && *p != '\0'; ++n)
        ++p;
    return p;
}

// Strips markup from the NUL-terminated text in pszBuffer, in place.
//
// What is removed:
//   - every '&' together with the five characters after it (treated as a
//     six-character entity such as "&nbsp;");
//   - every byte with a value from 65 to 122 (ASCII letters plus [ \ ] ^ _ `);
//   - everything while the '<' ... '>' depth w is non-zero;
//   - everything while the '{' ... '}' depth k is non-zero.
// The first character kept after a tag is preceded by a single space so that
// text runs separated by tags stay separated.
//
// pszBuffer: text to filter; the filtered result overwrites it and is always
//            NUL-terminated. A NULL pointer makes the function return false.
// w:         in/out '<' depth; read on entry and updated as tags are seen.
// k:         in/out '{' depth; read on entry and updated as braces are seen.
//            An unmatched '}' leaves it at 0 instead of wrapping around.
// chinese:   accepted for interface compatibility; it does not change the
//            result (the original code terminated the output identically in
//            every branch that tested it).
//
// Returns true, or false when pszBuffer is NULL.
bool TrimString(LPTSTR pszBuffer,UINT &w,UINT &k,bool chinese)
{
    (void)chinese;
    if (pszBuffer == NULL)
        return false;

    char *src = pszBuffer;
    char *dst = pszBuffer;
    bool needSeparator = false;   // set after a tag opens; next kept char gets a space first

    while (*src != '\0')
    {
        // Drop entities at the current position.
        while (*src == '&')
            src = SkipEntity(src);

        // Drop letters (and the few punctuation bytes in 65..122), together
        // with any entities that directly follow them.
        while (64 < *src && *src < 123)
        {
            ++src;
            while (*src == '&')
                src = SkipEntity(src);
        }

        // The skipping above may have consumed the rest of the text; stop
        // here so the terminator is neither copied nor stepped over.
        if (*src == '\0')
            break;

        const char c = *src;

        if (c == '{')
            ++k;

        if (k == 0)               // not inside { }
        {
            if (w != 0)           // inside < >: only track the depth
            {
                if (c == '>')
                    --w;
                else if (c == '<')
                    ++w;
            }
            else if (c == '<')    // a tag opens
            {
                ++w;
                needSeparator = true;
            }
            else                  // ordinary text: keep it
            {
                if (needSeparator)
                {
                    *dst++ = ' ';
                    needSeparator = false;
                }
                *dst++ = c;
            }
        }

        // Guarded so a stray '}' cannot wrap the unsigned depth and silence
        // all remaining output.
        if (c == '}' && k > 0)
            --k;

        ++src;
    }

    *dst = '\0';
    return true;
}


int main(i

nt argc, char* argv[])
{
download("https://www.doczj.com/doc/4317079527.html,");
/* if(argc > 1)
{
download((const char*)argv[1]);

}
else
{
cout<<"Usage: auto-Update url";
}
*/
FILE *fp=fopen("download.txt","rb");
char *buf=new char[100000];
fread(buf,1,100000,fp);
int number;
number=statistics(buf); //统计html文件中的字符数;
printf("%d\n",number); //输出文件所占的字符数;
fclose(fp);

FILE *fh=fopen("download.txt","rb");
char *duf=new char[number]; //定义字符所占的是适合数组;
fread(duf,1,number,fh);
UINT w,h;
w=h=0;
TrimString(duf,w,h,1); //读取html文件到duf指针;

FILE *fp1=fopen("aa.txt","wb");
fwrite(duf,1,strlen(duf)-10,fp1); //将duf指针的内容写入txt文件中;
fclose(fh);
fclose(fp1);
return 0;
}

相关主题
文本预览
相关文档 最新文档