概述
1.需要的jar包:1)httpclient-4.5系列jar包;2)jsoup-1.6.1.jar
2.写在前面:基本的思路是模拟一个类似商城搜索的功能,封装好后可以做到一次搜索返回几个商城相同商品的信息,说高大上一点,就是小型比价系统。
3.示例
3.1 抓取京东商城商品
3.1.1
先上结果
第1件商品:
商品:New balance/NB 热男鞋复古跑鞋NB新款生活休闲鞋运动鞋GM500NSG 黑色 44
价格: 249.00元
第2件商品:
商品:New Balance NB 女鞋 跑步鞋 休闲鞋WL373SGL/SKM/SNG 藏青色WL373SNG 39/8/250MM
价格:249.00元
第3件商品:
商品:New Balance NB 男鞋 经典复古鞋 休闲鞋M368LBK/LBR 黑色M368LBK 44/10/280MM
价格: 249.50元
第4件商品:
商品:New balance/NB热男鞋复古跑步鞋2015新款休闲鞋500运动鞋GM500GSB 灰色 44
价格: 259.50元
……………………………………
3.1.2 代码片段(仅供测试,请勿它用)
public class JingDongProduct implements Product{
private String qury = null; //请求的keyword相当于搜索框输入的词
private String sort = ""; //商品结果排序类型
@Override
public void setQury(String qury) {
this.qury = qury;
}
@Override
public String getQury(){
return qury;
}
@Override
public void setSortStyle(String sort) {
this.sort = sort;
}
@Override
public String getSortStyle() {
return sort;
}
@Override
public String getMessage() throws Exception {
String result = null;
try{
HttpClientBuilder builder = HttpClients.custom();
builder.setUserAgent("Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:0.9.4)");
CloseableHttpClient httpClient = builder.build();
URI uri = new URI("http","search.jd.com","/Search",
"keyword="+getQury()+getSortStyle()+"&enc=utf- 8&pvid=oy1a1hii.kd9cf",null); //-------------------(1)
HttpGet httpget = new HttpGet(uri);
httpget.addHeader("Referer","http://www.jd.com/");//---(2)
CloseableHttpResponse response = httpClient.execute(httpget);
HttpEntity entity = response.getEntity();
if (entity != null){
result = EntityUtils.toString(entity,"utf-8");
EntityUtils.consume(entity);
}
response.close();
httpClient.close();
}catch(ClientProtocolException cpe){
cpe.printStackTrace();
}catch(IOException ioe){
ioe.printStackTrace();
}
//价格
Document doc = Jsoup.parse(result);
Elements e1 = doc.select("[class=p-price]");
List<Element> prices = e1.select("i");
//商品名称
Elements e2 = doc.select("[class=p-name p-name-type-2]");
List<Element> products = e2.select("em");
StringBuffer buffer = new StringBuffer();
for(int i=0;i<products.size();i++){
buffer.append("第"+(i+1)+"件商品:"+"rn");
String product = products.get(i).siblingElements().text();
buffer.append("商品:"+product+"rn");
String price = prices.get(i).siblingElements().text().substring(1);
buffer.append("价格:"+price+"元"+"rn");
buffer.append("rn");
System.out.println("第"+(i+1)+"件商品:");
System.out.println("商品:"+product);
System.out.println("价格:"+price+"元");
System.out.println();
}
return buffer.toString();
}
@Override
public void saveToLocal(String result,String keyword, String sortStyle) throws IOException {
if(sortStyle.contains("1")){
sortStyle = "descend";
}else if(sortStyle.contains("2")){
sortStyle = "ascend";
}else if(sortStyle.contains("3")){
sortStyle = "sale";
}else if(sortStyle.contains("4")){
sortStyle = "criticism";
}else if(sortStyle.contains("5")){
sortStyle = "new";
}else{
sortStyle = "com";
}
Writer writer = new BufferedWriter(new OutputStreamWriter
(new FileOutputStream("tmp\"+keyword+"-"+sortStyle+"-"+"jindong.txt"),"gbk"));
writer.write(result);
writer.close();
}
public static void main(String[] args) throws Exception{
double begin = System.currentTimeMillis();
String keyword = "新百伦";
String sortStyle = jascendprice; //结果按价格升序排
JingDongProduct j = new JingDongProduct();
j.setSortStyle(sortStyle);
j.setQury(keyword);
String result = j.getMessage(); //打印并返回结果
j.saveToLocal(result, keyword, sortStyle); //保存至本地
double timeConsume = System.currentTimeMillis() - begin;
System.out.println("耗时:" + timeConsume/1000 + "秒");
}
}
3.1.3
比较关键的就是(1)处和(2)处。前者把URL地址包装成URI形式,最主要在于把搜索关键词部分keyword及排序类型sortStyle抽取出来,后面的字符串不能省略,不然会出错。因为这些商城页面都是动态的,不能直接抓取,所以(2)处设置了Referer请求头,模拟从商城首页跳转过来的人工访问。
3.2 抓取天猫商城商品
3.2.1 结果
第1件商品:
商品:New Balance 2015年新款男子支撑避震系列MR450CD3/MR450CG3
价格:249.00元
商家:top运动名品专营店
月成交额:8笔
第2件商品:
商品:New Balance 2015年新款中性复古鞋ML373SBB/ML373SRR
价格:249.00元
商家:top运动名品专营店
月成交额:0笔
第3件商品:
商品:New Balance/NB 女款长袖针织连帽外套 运动衫休闲外套AWJ53506
价格:258.30元
商家:New Balance旗舰店
月成交额:96笔
第4件商品:
商品:New Balance 2015年新款 中性373系列复古鞋ML373SNR
价格:275.00元
商家:top运动名品专营店
月成交额:32笔
3.2.2 代码(仅供测试,请勿它用)
public class TianMaoProduct implements Product{
private String qury = null;
private String sort = "";
@Override
public void setQury(String qury) {
this.qury = qury;
}
@Override
public String getQury(){
return qury;
}
@Override
public void setSortStyle(String sort){
this.sort = sort;
}
@Override
public String getSortStyle(){
return sort;
}
@Override
public String getMessage()throws Exception {
String result = null;
try{
HttpClientBuilder builder = HttpClients.custom();
builder.setUserAgent("Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:0.9.4)");
CloseableHttpClient httpClient = builder.build();
URI uri = new URI("https","list.tmall.com","/search_product.htm",
"q="+getQury()+getSortStyle(),null); //--(1)
HttpGet httpget = new HttpGet(uri);
httpget.addHeader("Referer","https://www.tmall.com/?spm=a220m.1000858.a2226mz.1.RzPkM0"); //----------(2)
CloseableHttpResponse response = httpClient.execute(httpget);
HttpEntity entity = response.getEntity();
if (entity != null){
result = EntityUtils.toString(entity,"gbk");
EntityUtils.consume(entity);
}
response.close();
httpClient.close();
}catch(ClientProtocolException cpe){
cpe.printStackTrace();
}catch(IOException ioe){
ioe.printStackTrace();
}
//价格
Document doc = Jsoup.parse(result);
Elements e1 = doc.select("[class=productPrice]");
List<Element> prices = e1.select("em");
//商品名称
Elements e2 = doc.select("[class=productTitle]");
List<Element> products = e2.select("a");
//商店
Elements e3 = doc.select("[class=productShop]");
List<Element> shops = e3.select("a");
//月成交额
Elements e4 = doc.select("[class=productStatus]");
List<Element> status = e4.select("em");
StringBuffer buffer = new StringBuffer();
for(int i=0;i<products.size();i++){
buffer.append("第"+(i+1)+"件商品:"+"rn");
String product = products.get(i).siblingElements().text();
buffer.append("商品:"+product+"rn");
String price = prices.get(i).siblingElements().text().substring(1);
buffer.append("价格:"+price+"元"+"rn");
String shop = shops.get(i).siblingElements().text();
buffer.append("商家:"+shop+"rn");
String statu = status.get(i).siblingElements().text();
buffer.append("月成交额:"+statu+"rn");
buffer.append("rn");
System.out.println("第"+(i+1)+"件商品:");
System.out.println("商品:"+product);
System.out.println("价格:"+price+"元");
System.out.println("商家:"+shop);
System.out.println("月成交额:"+statu);
System.out.println();
}
return buffer.toString();
}
@Override
public void saveToLocal(String result,String keyword,String sortStyle) throws IOException{
if(sortStyle.contains("pd")){
sortStyle = "descend";
}else if(sortStyle.contains("p")){
sortStyle = "ascend";
}else if(sortStyle.contains("sort=d")){
sortStyle = "sale";
}else if(sortStyle.contains("sort=rq")){
sortStyle = "popularity";
}else if(sortStyle.contains("new")){
sortStyle = "new";
}else if(sortStyle.contains("sort=s")){
sortStyle = "com";
}else {
sortStyle = "qita";
}
Writer writer = new BufferedWriter(new OutputStreamWriter
(new FileOutputStream("tmp\"+keyword+"-"+sortStyle+"-"+"tianmao.txt"), "gbk"));
writer.write(result);
writer.close();
}
public static void main(String[] args) throws Exception{
double begin = System.currentTimeMillis();
String keyword = "新百伦";
String sortStyle = tascendprice; //按价格升序
TianMaoProduct t = new TianMaoProduct();
t.setSortStyle(sortStyle);
t.setQury(keyword);
String result = t.getMessage();
t.saveToLocal(result,keyword ,sortStyle);
double timeConsume = System.currentTimeMillis() - begin;
System.out.println("耗时:" + timeConsume/1000 + "秒");
}
}
3.2.3
跟上例相同,(1)和(2)处是关键,不同的是此处的(1)不用写全uri的query部分,天猫商城的服务端识别到请求后应该是会自动加上那些省略的部分,所以可以偷懒不写,当然写上也没有问题。(2)处没什么好说的,同样是设置Referer请求头来模拟跳转。
4 其实还有一个接口类,现在丢上来(仅供测试,请勿它用)
/**
 * Contract shared by the shop scrapers, plus the sort fragments each shop understands.
 *
 * <p>Every constant is a ready-made piece of query string (including its leading
 * {@code &}) that is passed to {@link #setSortStyle(String)}.
 */
public interface Product {

    // ---- Tmall sort fragments ----

    /** Tmall: default (comprehensive) ordering. */
    String tcom = "&sort=s";
    /** Tmall: by sales, descending. */
    String tsales = "&sort=d";
    /** Tmall: by price, ascending. */
    String tascendprice = "&sort=p";
    /** Tmall: by price, descending. */
    String tdescendprice = "&sort=pd";
    /** Tmall: by popularity, descending. */
    String tpopularity = "&sort=rq";
    /** Tmall: newest first. */
    String tnew = "&sort=new";

    // ---- JD sort fragments ----

    /** JD: default (comprehensive) ordering. */
    String jcom = "";
    /** JD: by price, descending. */
    String jdescendprice = "&psort=1";
    /** JD: by price, ascending. */
    String jascendprice = "&psort=2";
    /** JD: by sales, descending. */
    String jsales = "&psort=3";
    /** JD: by number of reviews, descending. */
    String criticismNum = "&psort=4";
    /** JD: newest first. */
    String jnew = "&psort=5";

    /** Sets the search keyword. */
    void setQury(String qury);

    /** Returns the search keyword. */
    String getQury();

    /** Sets the sort fragment. */
    void setSortStyle(String sort);

    /** Returns the sort fragment. */
    String getSortStyle();

    /** Produces the result text for the current keyword and sort fragment. */
    String getMessage() throws Exception;

    /** Saves the result data locally. */
    void saveToLocal(String result, String keyword, String sortStyle) throws IOException;
}
5 后话
之前听说httpclient只能抓静态页面,一天下午突然心血来潮想试试能不能抓到天猫的数据,折腾了半个下午终于发现是可以的。但个人觉得主要还存在几个问题。
1)在搜索出来的首页,京东页面的销量数据是看不到的,怎么抓取这部分数据呢,抑或是抓评论数来替代。
2)图片暂时没理它
3)数据保存问题
最后
以上就是甜蜜路人为你收集整理的爬取天猫京东实例的全部内容,希望文章能够帮你解决爬取天猫京东实例所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
发表评论 取消回复