如何java写/实现网络爬虫抓取网页 用java编写网络爬虫,用来爬网络音乐资源,再返回java页...

如何java写/实现网络爬虫抓取网页

网络爬虫是一个自动提取网页的程序，它为搜索引擎从万维网上下载网页，是搜索引擎的重要组成。传统爬虫从一个或若干初始网页的URL开始，获得初始网页上的URL，在抓取网页的过程中，不断从当前页面上抽取新的URL放入队列，直到满足系统的一定停止条件。
java实现网页源码获取的步骤：
(1)新建URL对象，表示要访问的网址。如：url=new URL("http://www.sina.com.cn");
(2)建立HTTP连接，返回连接对象urlConnection对象。如：urlConnection = (HttpURLConnection)url.openConnection();
(3)获取相应HTTP 状态码。如responsecode=urlConnection.getResponseCode();
(4)如果HTTP 状态码为200，表示成功。从urlConnection对象获取输入流对象来获取请求的网页源代码。

下面是源代码,希望可以帮到你~~
package com.ly.mainprocess;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.Consts;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.StatusLine;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

/**
 * Demo client that logs in to hi.mop.com with Apache HttpClient, follows the redirect returned
 * by the login request, and then calls the continuous-login award ("auto check-in") endpoint.
 *
 * <p>All requests go through one {@code DefaultHttpClient} instance, so cookies set by the login
 * response are sent again on the later requests.
 */
public class Test1 {

    /** HTTP status code that marks the check-in request as successful. */
    private static final int HTTP_OK = 200;

    public static void main(String[] args) {
        Test1 test1 = new Test1();
        System.out.println(test1.process("******", "******"));
    }

    /**
     * Logs in with the given credentials and triggers the check-in request.
     *
     * @param username value sent as the {@code nickname} form field
     * @param password value sent as the {@code password} form field
     * @return {@code true} if the check-in request answered with HTTP status 200; {@code false}
     *     if it answered with another status, if the login response carried no redirect, or if
     *     any request failed before the status was read
     */
    @SuppressWarnings("deprecation")
    public boolean process(String username, String password) {
        boolean ret = false;
        DefaultHttpClient httpclient = new DefaultHttpClient();
        try {
            // Build the login POST request.
            HttpPost httppost = new HttpPost("http://login.hi.mop.com/Login.do"); // user login
            List<BasicNameValuePair> nvps = new ArrayList<BasicNameValuePair>();
            nvps.add(new BasicNameValuePair("nickname", username));
            nvps.add(new BasicNameValuePair("password", password));
            nvps.add(new BasicNameValuePair("origURL", "http://hi.mop.com/SysHome.do"));
            nvps.add(new BasicNameValuePair("loginregFrom", "index"));
            nvps.add(new BasicNameValuePair("ss", "10101"));

            httppost.setEntity(new UrlEncodedFormEntity(nvps, Consts.UTF_8));
            httppost.addHeader("Referer", "http://hi.mop.com/SysHome.do");
            httppost.addHeader("Connection", "keep-alive");
            httppost.addHeader("Content-Type", "application/x-www-form-urlencoded");
            httppost.addHeader("Accept-Language", "zh-CN,zh;q=0.8");
            httppost.addHeader("Origin", "http://hi.mop.com");
            httppost.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36");
            HttpResponse response = httpclient.execute(httppost);
            // Drain the login body so the connection can be reused for the next request.
            EntityUtils.consume(response.getEntity());

            // Follow the redirect announced by the login response.
            Header locationHeader = response.getFirstHeader("Location");
            if (locationHeader == null) {
                // Without a redirect there is no fresh response to read; previously this path
                // re-read the already consumed login entity and failed silently.
                System.err.println("Login response carried no Location header; aborting.");
                return false;
            }
            response = httpclient.execute(new HttpGet(locationHeader.getValue()));
            // The landing page content is not needed; just release the connection.
            EntityUtils.consume(response.getEntity());

            // Auto check-in: call the continuous-login award endpoint.
            HttpPost httppost1 = new HttpPost("http://home.hi.mop.com/ajaxGetContinusLoginAward.do");
            httppost1.addHeader("Content-Type", "text/plain;charset=UTF-8");
            httppost1.addHeader("Accept", "text/plain, */*");
            httppost1.addHeader("X-Requested-With", "XMLHttpRequest");
            httppost1.addHeader("Referer", "http://home.hi.mop.com/Home.do");
            response = httpclient.execute(httppost1);
            // Compare the numeric status instead of matching the "HTTP/1.1 200 OK" text, which
            // breaks on a different protocol version or reason phrase.
            if (response.getStatusLine().getStatusCode() == HTTP_OK) {
                ret = true;
            }
            // Show the result.
            printBody(response.getEntity());
        } catch (Exception e) {
            // Keep the best-effort contract (return what was determined so far), but report the
            // failure instead of swallowing it.
            System.err.println("Request failed: " + e);
        } finally {
            httpclient.getConnectionManager().shutdown();
        }
        return ret;
    }

    /** Prints the entity body to standard output line by line; does nothing for a null entity. */
    private static void printBody(HttpEntity entity) throws java.io.IOException {
        if (entity == null) {
            return;
        }
        BufferedReader reader =
                new BufferedReader(new InputStreamReader(entity.getContent(), Consts.UTF_8));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        } finally {
            reader.close();
        }
    }
}

原理即是保存登录后的cookie数据,以后每次抓取页面时把cookie放在头部信息里发送过去。系统是根据cookie来判断用户的:有了cookie就有了登录状态,以后的访问都是基于这个cookie对应的用户的。补充:Java是一种可以撰写跨平台应用软件的面向对象的程序设计语言。Java技术具有卓越的通用性、高效性、平台移植性和安全性,广泛应用于PC、数据中心、游戏控制台、科学超级计算机、移动电话和互联网,同时拥有全球最大的开发者专业社群。

  • 如何写出好的Java代码
    绛旓細41銆佸浜庨偅浜涒濆彇寰楁垨鏀瑰彉Private鏁版嵁鍊尖濈殑鍑芥暟,璇蜂娇鐢↗ava Beans 鐨勨漡et鈥濄佲漵et鈥濄佲漣s鈥濈瓑鍛藉悕涔犳儻,鍗充娇浣犲綋鏃朵笉璁や负鑷繁姝e湪鎾板啓Java Bean銆傝繖涔堝仛涓嶄粎鍙互杞绘槗浠ean鐨勮繍鐢ㄦ柟寮忔潵杩愮敤浣犵殑class,涔熸槸瀵规绫诲嚱鏁扮殑涓绉嶆爣鍑嗗懡鍚嶆柟寮,浣胯鑰呮洿鏄撲簬鐞嗚В銆42銆佸浜庝綘鎵鎷熷畾鐨勬瘡涓涓猚lass,璇疯冭檻涓哄畠鍔犲叆static pub...
  • java怎么写
    绛旓細1銆 棣栧厛锛屽湪D鐩樹笅寤虹珛浠绘剰寤虹珛涓涓洰褰曪紙寤鸿鏄潪涓枃鐨勭洰褰曪級锛岃繖閲屾垜寤虹珛鐨勭洰褰曟槸javacode銆傜劧鍚庤繘鍏ヨ鐩綍锛屽湪璇ョ洰褰曚笅寤虹珛涓涓枃浠跺悕鏄細HelloWorld.java鐨鏅氭枃浠躲2銆 浣跨敤鏂囨湰鎵撳紑璇ユ枃浠躲傜劧鍚庤緭鍏ヤ竴涓嬪唴瀹癸紝鍒濆瑕佺壒鍒敞鎰忓崟璇嶇殑澶у皬鍐欏拰姣忎釜鍗曡瘝涔嬮棿閮藉繀椤昏鏈夌┖鏍硷紝杩樺緱娉ㄦ剰澶ф嫭鍙峰拰鍒嗗彿绛夌鍙枫
  • java怎么写
    答：1、首先我们打开电脑桌面，点击桌面左下角系统图标，打开记事本。2、写好代码之后，点击左上角的文件。3、然后我们再下拉菜单里点击保存选项进行保存。4、这时候我们把文件名的后缀改成java即可创建Java文件，改成其他后缀名即可创建其他格式文件。5、改好之后点击保存就可以了。
  • 记事本怎么写java代码?
    绛旓細1銆佺敤娴忚鍣ㄦ墦寮鐢ㄨ浜嬫湰缂栧啓鐨勪唬鐮 鏂板缓鈥滄枃鏈枃妗b濆悗锛岄紶鏍囧彸閿偣鍑昏鏂囨湰鏂囨。锛屽湪鑿滃崟鏍忕殑鈥滄墦寮鏂瑰紡鈥濋夋嫨鈥滅敤璁颁簨鏈墦寮鈥濓紝涔熷彲浠ヨ缃粯璁ゆ墦寮鏂瑰紡涓衡滆浜嬫湰鈥濓紱鐢ㄨ浜嬫湰鎵撳紑鏂囨湰鏂囨。鍚庯紝鐩存帴鍦ㄨ鏂囨。鍐呮牴鎹嚜宸辩殑闇瑕佽緭鍏ユ兂瑕佺紪杈戠殑缃戦〉浠g爜銆2銆佽浜嬫湰鍐檍ava浠g爜鎬庝箞杩愯 棣栧厛锛岄渶瑕佸畨瑁卝dk骞堕厤缃...
  • 自学Java如何入门?
    绛旓細涓銆Java鍩虹 JavaSE鍩虹鏄疛ava涓骇绋嬪簭鍛樼殑璧风偣锛屾槸甯姪浣犱粠灏忕櫧鍒版噦寰楃紪绋嬬殑蹇呯粡涔嬭矾銆傚湪Java鍩虹鏉垮潡涓湁6涓瓙妯″潡鐨勫涔狅細鍩虹璇硶锛屽彲甯姪浣犲缓绔嬪熀鏈殑缂栫▼閫昏緫鎬濈淮锛涢潰鍚戝璞★紝浠ュ璞℃柟寮忓幓缂栧啓浼樼編鐨凧ava绋嬪簭锛涢泦鍚堬紝鍚庢湡寮鍙戜腑瀛樺偍鏁版嵁蹇呭鎶鏈紱IO锛屽纾佺洏鏂囦欢杩涜璇诲彇鍜屽啓鍏ュ熀纭鎿嶄綔锛涘绾跨▼涓庡苟鍙...
  • 做java软件工程师,怎样才能写出好的代码?
    绛旓細1.閲嶈娉ㄩ噴 鏈夌殑java绋嬪簭鍛樺湪鍐欎唬鐮佹椂锛屼粠鏉ユ病鏈夋兂杩囪鍦╦ava浠g爜鍚庡姞涓婄浉鍏崇殑娉ㄩ噴锛岀敋鑷虫槸涓婁竾琛岀殑浠g爜涔熸病鏈夋兂杩囧姞涓婃敞閲婏紝杩欏氨瀛樺湪寰堝ぇ鐨勯棶棰橈紝涓嶈浣犵殑浠g爜浼氳窡鍏朵粬浜哄垎浜璁猴紝灏变綘鑷繁鍥為【浣犳槸鎬庝箞鍐濂借繖绡囦唬鐮佺殑锛屼綘涔熸槸鍗婂ぉ鏃犳硶鐞嗗嚭澶寸华锛岃繖灏变负缁存姢鍜屼慨鏀圭瓑绛夊伐浣滄坊鍔犱簡寰堝ぇ鐨勯夯鐑︺傛墍浠ワ紝...
  • 如何写出优质Java代码的4个技巧?
    绛旓細鍦ㄤ互鍓嶄竴绡囨枃绔犱腑锛屾垜浠璁轰簡蹇呰鏃跺彲浠ヤ娇鐢ㄧ殑鍥涚鐗规畩鎶鏈紝杩欎簺鐗规畩鎶鏈彲浠ュ垱寤烘洿濂界殑Java杞欢锛涜屾湰鏂囨垜浠皢浠嬬粛涓浜涙湁鍔╀簬瑙e喅甯歌闂鐨勯氱敤璁捐绛栫暐鍜岀洰鏍囧疄鐜版妧鏈紝鍗筹細1.鍙仛鏈夌洰鐨勬х殑浼樺寲 2.甯搁噺灏介噺浣跨敤鏋氫妇 3.閲嶆柊瀹氫箟绫婚噷闈㈢殑equals()鏂规硶 4.灏介噺澶氫娇鐢ㄥ鎬佹 鍊煎緱娉ㄦ剰鐨勬槸锛屾湰鏂囦腑鎻忚堪鐨...
  • 怎么编写 JAVA 代码
    绛旓細涓銆Java缂栫▼鍏ラ棬绫 瀵逛簬娌℃湁Java缂栫▼缁忛獙鐨勭▼搴忓憳瑕佸叆闂紝闅忎究璇讳粈涔堝叆闂ㄤ功绫嶉兘涓鏍凤紝杩欎釜闃舵闇瑕佸揩閫熺殑鎺屾彙Java鍩虹璇硶鍜屽熀鏈敤娉曪紝瀹楁棬灏辨槸鈥滃洬鍥靛悶鏋d笉姹傜敋瑙b濓紝鍏堝Java鐔熸倝璧锋潵鍐嶈銆傜敤寰堢煭鐨勬椂闂村揩閫熻繃涓閬岼ava璇硶锛岃繛鎳靛甫鐚滃鍐欏啓浠g爜锛岃鈥滅煡鍏剁劧鈥濄1銆併奐ava缂栫▼鎬濇兂銆嬪畠瀵逛簬鍩烘湰鐨勯潰鍚...
  • 怎样才可以写好Java程序?
    绛旓細濡備綍鎵嶈兘澶熷啓濂Java绋嬪簭?閭d箞灏辩涓嶅紑浣犵殑妯′豢锛屾瘮濡備綘鍙互涔板嚑鏈粡鍏哥殑Java缂栫▼涔︼紝鎶婁功涓婃墍鏈変緥绋嬪叏閮ㄩ噸鏂板啓涓閬嶏紝閫愪釜姣旇緝鍜屼功涓婅寖渚嬬殑宸窛锛屼竴姝ヤ竴姝ユ敼鍠勮嚜宸辩紪绋嬬殑椋庢牸鍜屾妧宸с傛椂闂撮暱浜嗭紝鑷劧灏辫兘鍐欏嚭鍍忎功涓婁緥绋嬩竴鏍风殑浠g爜锛岀敋鑷冲彲浠ユ瘮涔︿笂鍐欏緱濂斤紝浣犵殑Java绋嬪簭缂栧啓鑷劧鏄細瓒婃潵瓒婃鐨勩
  • 怎么写好Java代码?
    绛旓細濡備綍鎻愬崌Java浠g爜姘村钩?棣栧厛浣犵殑Java鐞嗚鍩虹灏变笉鑳藉緢宸紝涓涓狫ava鐞嗚鍩虹寰堝樊鐨凧ava绋嬪簭鍛橈紝閭d箞浣犳湡寰呬粬鐨凧ava浠g爜姘村钩寰堝ソ锛岃繖鏄笉鏄湁鐐硅楝间簡銆傛墍浠ava浠g爜姘村钩瑕佸ソ锛岄偅涔堝氨寰桱ava鐞嗚濂藉ソ鍦版帉鎻″ソ锛屽苟涓旀槸鐞嗚+浠g爜鏁插啓锛屼竴璧疯繘姝ワ紝涓嶈鎯崇潃鎴戜笉閲嶈Java鐞嗚锛孞ava浠g爜渚濇棫鑳藉鏁插緱璧烽銆傚叾娆★紝...
  • 扩展阅读:java入门网站 ... 初学编程必背50个 ... javascript入门 ... 第一章掌握javascript基础 ... java自学要学多久 ... java的开发过程 ... 学java一般能干什么 ... java交互 ... mac开发java感受 ...

    本站交流只代表网友个人观点,与本站立场无关
    欢迎反馈与建议,请联系电邮
    2024© 车视网