载入中,请稍候……

VC++中网页内容提取范例:提取企业黄页信息

Admin 于 2008-09-20 16:52:22 发表C/C++

订阅: http://www.miniboke.com/Feed/Article_35.aspx
引用: http://www.miniboke.com/Trackback/lRGpenRiukTdeSCduMyr.aspx (UTF-8)
VC实现文件拖拽 < VC++中网页内容提取范例:提取企业黄页信息 > VC系统托盘编程

功能:输入一个城市或地区名称后,提取该地区所有的企业信息。
刚刚总算完成了个大概,用了一天多一点的时间,主要代码如下:

  1. void CCompanyInformationView::OnCollectInformation() 
  2.     CString strURL="http://www.hengzhe.com/Company/gs.html?q="
  3.     CWTString wts; 
  4.     CString str ; 
  5.     CString LinkSavePath="E:\\LinkSave.txt";    
  6.     CStdioFile myFile; 
  7.     myFile.Open(LinkSavePath,CFile::modeCreate|CFile::modeReadWrite); 
  8.     myFile.Close(); 
  9.     m_bar.GetDlgItemText(idcArea,str); 
  10.     strURL+=wts.URLEncode(wts.GB2312ToUTF8(str)); 
  11.     strURL+="&ar=%e5%85%a8%e5%9b%bd"
  12.     LoopPageLink(strURL); 
  13.     
  14. //-----递归取页数 
  15. void CCompanyInformationView::LoopPageLink(CString strURL) 
  16.     CString strInitURL=strURL; 
  17.     CWTString wts; 
  18.     CString str ; 
  19.     GetPageLink(strURL); 
  20.  
  21.     if(ant.Request("GET",strInitURL)<1){ 
  22.         CString strError = ant.GetError(); 
  23.         return ; 
  24.     } 
  25.     str=wts.UTF8ToGB2312(ant.m_strBody); 
  26.     WriteFileText("E:\\Company.htm",str); 
  27.     
  28.     reader.SetHtml(str); 
  29.     LPCTSTR lpTag; 
  30.     CString strValue,strName,strInside; 
  31.     
  32.     /*for(;;) 
  33.     { 
  34.         
  35.         lpTag=reader.GetNextTag(); 
  36.         strName=reader.GetTagName(lpTag); 
  37.         if(strName!="a")  continue; 
  38.         strInside=reader.GetInnerHtml(lpTag); 
  39.         strValue=reader.GetTagAttribute(lpTag,"href"); 
  40.         if(strValue.Find("gs.html?")<0)  continue; 
  41.         str="http://www.hengzhe.com/Company/"+strValue; 
  42.         GetPageLink(str); 
  43.         break; 
  44.     }*/ 
  45.     for(;;) 
  46.     { 
  47.         lpTag=reader.GetNextTag(); 
  48.         strName=reader.GetTagName(lpTag); 
  49.         if(strName!="a")  continue
  50.         if(strName.Compare("/html")==0)  break
  51.         strInside=reader.GetInnerHtml(lpTag); 
  52.         if(strInside.Compare("下一页")!=0) continue
  53.         strValue=reader.GetTagAttribute(lpTag,"href"); 
  54.         strValue="http://www.hengzhe.com/Company/"+strValue; 
  55.         LoopPageLink(strValue); 
  56.         break
  57.     } 
  58.       
  59.  
  60.  
  61.  
  62. //----------------------------------取一个页面链接的所有公司 
  63. void CCompanyInformationView::GetPageLink(CString strURL) 
  64.     CWTString wts; 
  65.     CString str ; 
  66.     WriteFileText("E:\\kankan.txt",strURL); 
  67.     if(ant.Request("GET",strURL)<1){ 
  68.         CString strError = ant.GetError(); 
  69.         return ; 
  70.     } 
  71.     str=wts.UTF8ToGB2312(ant.m_strBody); 
  72.     CString LinkSavePath="E:\\LinkSave.txt";    
  73.     CStdioFile myFile; 
  74.     myFile.Open(LinkSavePath,CFile::modeReadWrite); 
  75.     reader.SetHtml(str); 
  76.     LPCTSTR lpTag; 
  77.     CString strValue,strName,strInside; 
  78.     for(;;) 
  79.     { 
  80.         
  81.         lpTag=reader.GetNextTag(); 
  82.         strName=reader.GetTagName(lpTag); 
  83.         if(strName.Compare("/html")==0) break
  84.         if(strName!="a")  continue
  85.         strValue=reader.GetTagAttribute(lpTag,"href"); 
  86.         if(strValue.Find("vi.html?")<0) continue
  87.         str="http://www.hengzhe.com/Company/"+strValue; 
  88.         myFile.WriteString("\r\n"); 
  89.         myFile.WriteString(str); 
  90.             
  91.     } 
  92.     myFile.Close(); 
  93.  
  94.     SaveData(); 
  95.  
  96. //---------------------------------- 
  97. void CCompanyInformationView::SaveData()//数据库操作将某个企业基本信息保存到数据库 
  98.         
  99.     CWTString wts; 
  100.     CString str; 
  101.     CSQLInsert sql; 
  102.     CString strURL; 
  103.     CStdioFile myFile; 
  104.     myFile.Open("E:\\LinkSave.txt",CFile::modeRead); 
  105.     myFile.ReadString(strURL); 
  106.     for(;;) 
  107.     { 
  108.         myFile.ReadString(strURL); 
  109.         if (strURL.IsEmpty())  break
  110.  
  111.         if(ant.Request("GET",strURL)<1){ 
  112.             CString strError = ant.GetError(); 
  113.             return ; 
  114.         } 
  115.         sql.SetTableName("CompanyInformation");    
  116.         int index=dbGetLastKeyId(m_pDB,"CompanyInformation")+1; 
  117.         sql.SetFieldLong("KeyId",index); 
  118.  
  119.         LPCTSTR lpTag; 
  120.         CString strValue,strName,strInside; 
  121.         int at1,at2; 
  122.         reader.SetHtml(wts.UTF8ToGB2312(ant.m_strBody)); 
  123.         for(;;) 
  124.         {        
  125.             lpTag=reader.GetNextTag(); 
  126.             strName=reader.GetTagName(lpTag); 
  127.             if(strName.Compare("/body")==0)  break
  128.             if(strName.Compare("td")!=0)  continue
  129.             strValue=reader.GetTagAttribute(lpTag,"width"); 
  130.             if(strValue.Compare("70%")==0)  continue
  131.             strInside=reader.GetInnerHtml(lpTag); 
  132.             if(strInside.Find("公司名称:")>=0) 
  133.             { 
  134.                 strInside.TrimRight(); 
  135.  
  136.                 at1=strInside.Find(":"); 
  137.                 str=strInside.Mid(at1+2,strInside.GetLength()-at1+3); 
  138.                 str.TrimLeft(); 
  139.                 //str.TrimLeft(":"); 
  140.                 sql.SetFieldText("Customer",str); 
  141.             } 
  142.             else if(strInside.Find("所在区域:")>=0) 
  143.             { 
  144.                 strInside.TrimRight(); 
  145.                 if(strInside.GetLength()<=5)  continue
  146.                 at1=strInside.Find(":"); 
  147.                 at2=strInside.Find("."); 
  148.                 str=strInside.Mid(at1+2,at2-at1-2); 
  149.                 sql.SetFieldText("Province",str); 
  150.                 str=strInside.Mid(at2+1,strInside.GetLength()-at2+1); 
  151.                 str.TrimLeft(); 
  152.                 sql.SetFieldText("City",str); 
  153.             } 
  154.             else if(strInside.Find("联 系 人:")>=0) 
  155.             { 
  156.                 strInside.TrimRight(); 
  157.                 if(strInside.GetLength()<=6)  continue
  158.                 at1=strInside.Find(":"); 
  159.                 str=strInside.Mid(at1+2,strInside.GetLength()-at1); 
  160.                 str.TrimLeft(); 
  161.                 sql.SetFieldText("Contact",str); 
  162.             } 
  163.             else if(strInside.Find("详细信息:")>=0) 
  164.             {            
  165.                 lpTag=reader.GetNextTag(); 
  166.                 lpTag=reader.GetNextTag(); 
  167.                 strInside=reader.GetInnerHtml(lpTag); 
  168.                 strInside.TrimRight(); 
  169.                 str.TrimLeft(); 
  170.                 sql.SetFieldText("Abstract",strInside); 
  171.             } 
  172.             else if(strInside.Find("联系电话:")>=0) 
  173.             { 
  174.                 strInside.TrimRight(); 
  175.                 if(strInside.GetLength()<=10)  continue
  176.                 at1=strInside.Find("-"); 
  177.                 at2=strInside.Find("-",at1+1); 
  178.                 str=strInside.Mid(at1+1,at2-at1-1); 
  179.                 if(!str.IsEmpty())  sql.SetFieldText("Area",str); 
  180.                 str=strInside.Mid(at2+1,strInside.GetLength()-at2); 
  181.                 if(!str.IsEmpty())  sql.SetFieldText("Phone",str); 
  182.             } 
  183.             else if(strInside.Find("传 真:")>=0) 
  184.             { 
  185.                 strInside.TrimRight(); 
  186.                 if(strInside.Find("-")>=0) 
  187.                 { 
  188.                     at1=strInside.Find("-"); 
  189.                     at2=strInside.Find("-",at1+1); 
  190.                     str=strInside.Mid(at2+1,strInside.GetLength()-at2); 
  191.                     str.TrimLeft(); 
  192.                     sql.SetFieldText("Fax",str); 
  193.                 } 
  194.             } 
  195.             else if(strInside.Find("移动电话:")>=0) 
  196.             { 
  197.                 strInside.TrimRight(); 
  198.                 at1=strInside.Find(":"); 
  199.                 str=strInside.Mid(at1+2,strInside.GetLength()-at1); 
  200.                 str.TrimLeft(); 
  201.                 at1=str.Find(">"); 
  202.                 str=str.Mid(at1+1,str.GetLength()-at1); 
  203.                 if(str.IsEmpty()) continue
  204.                 sql.SetFieldText("Mobile",str); 
  205.             } 
  206.             else if(strInside.Find("电子邮箱:")>=0) 
  207.             {        
  208.                 strInside.TrimRight(); 
  209.                 int ii=strInside.GetLength(); 
  210.                 if(strInside.GetLength()<=10)  continue
  211.                 at1=strInside.Find(":"); 
  212.                 str=strInside.Mid(at1+1,strInside.GetLength()-at1); 
  213.                 str.TrimLeft(); 
  214.                 sql.SetFieldText("Email",str); 
  215.             } 
  216.             else if(strInside.Find("公司地址:")>=0) 
  217.             { 
  218.                 strInside.TrimRight(); 
  219.                 at1=strInside.Find(":"); 
  220.                 str=strInside.Mid(at1+2,strInside.GetLength()-at1); 
  221.                 str.TrimLeft(); 
  222.                 sql.SetFieldText("Address",str); 
  223.             } 
  224.             else if(strInside.Find("公司网址:")>=0) 
  225.             { 
  226.                 strInside.TrimRight(); 
  227.                 if(strInside.Find("www")<0&&strInside.Find("http")<0) continue
  228.                 lpTag=reader.GetNextTag(); 
  229.                 strValue=reader.GetTagAttribute(lpTag,"href"); 
  230.                 sql.SetFieldText("Homepage",strValue); 
  231.             }        
  232.         } 
  233.         CString strSQL=sql.GetSQL(); 
  234.         
  235.         if(m_pDB->Execute(strSQL)<1) 
  236.         { 
  237.             MessageBox(m_pDB->m_strError+"\r\n"+strSQL); 
  238.             return
  239.         } 
  240.         sql.DeleteContents(); 
  241.         m_qdbgrid.Retrieve(m_pDB); 
  242.         m_grid.Invalidate(); 
  243.     } 

 

被阅714次, 0投一票
  • 看完了要说点啥么?
  • 昵称 (不填说不了话)
  • 信箱地址 (不会被公开,但是不填也说不了话)
  • 网址 (这个不填也成)

Powered by MiniBoke v2.0.0.8 Build 0828

Copyright © 2008 迷你博客. All rights reserved.

粤ICP备07500939号