A Java crawler for Tmall, Taobao and JD (Jingdong): search pages and product details

First identify the product URL and determine the platform; extract the product number from the URL, then crawl the data for that platform using the product number.

1. Import dependencies

<!-- Crawler related Jar Packet dependency -->
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi-ooxml</artifactId>
      <version>3.10-FINAL</version>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.3</version>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.11.3</version>
    </dependency>

    <dependency>
      <groupId>org.projectlombok</groupId>
      <artifactId>lombok</artifactId>
      <scope>provided</scope>
    </dependency>

2. Encapsulate return types and constants

Lombok's @Data annotation is introduced to avoid boilerplate code such as getters, setters and toString.

package java1024.xyz.vo;

import lombok.Data;

/**
 * @author xivin
 * @email 1250402127@qq.com
 * @description URL parsing result
 * @date 2020/1/3
 */
    @Data
    public class UrlData {

    // 1 = URL recognized and product number extracted; 0 = blank, unknown or malformed URL
    private int status;
    // the matched platform sign (UrlConst.tmallUrlSign / taobaoUrlSign / jingdongUrlSign)
    private String platform;
    // numeric product id parsed out of the URL
    private Long number;

}

package java1024.xyz.vo;

import com.fasterxml.jackson.annotation.JsonFormat;
import lombok.Data;

import java.io.Serializable;
import java.sql.Timestamp;

/**
 * @author xivin
 * @email 1250402127@qq.com
 * @description commodity entity class
 * @date 2020/1/3
 */
    @Data
    public class Product implements Serializable {

    // primary key; the crawler methods use 0L as an "item does not exist" marker
    private Long id;

    // the product id extracted from the platform URL
    private Long number;

    private Float price;

    private Integer userId;

    // canonical product-detail URL rebuilt from the UrlConst prefix + number
    private String url;

    // 1 = Tmall, 2 = Taobao (values assigned by the crawler methods)
    private Integer platformId;

    private String title;

    // free-text description; NOTE(review): "describe" is a reserved word in MySQL - verify the column mapping
    private String describe;

    private Integer status;

    // serialized as "yyyy-MM-dd HH:mm:ss" in JSON responses
    @JsonFormat( pattern="yyyy-MM-dd HH:mm:ss")
    private Timestamp createdAt;

    private Timestamp updatedAt;

}

## 3. With the groundwork done, encapsulate the URL-recognition utility UrlUtils.java

/**
 * @author xivin
 * @email 1250402127@qq.com
 * @description URL recognition utility
 * @date 2020/1/3
 */
    public class UrlUtils {

        /**
         * Parses a product URL: determines the platform (Tmall / Taobao / JD)
         * and extracts the numeric product id.
         *
         * @param url product page URL (may be null or empty)
         * @return UrlData with status=1, platform and number set on success;
         *         status=0 when the URL is blank, unrecognized or malformed
         */
        public static UrlData analyseUrl(String url) {

            UrlData urlData = new UrlData();
            try {

                // Reject blank input
                if (StringUtils.isEmpty(url)) {
                    urlData.setStatus(0);
                    return urlData;
                }

                // Tmall and Taobao both carry the product id as an "id=..."
                // query parameter, so they share one extraction path
                // (previously the two branches were duplicated line-for-line).
                if (url.contains(UrlConst.tmallUrlSign)) {
                    urlData.setPlatform(UrlConst.tmallUrlSign);
                    return fillFromIdQueryParam(urlData, url);
                }
                if (url.contains(UrlConst.taobaoUrlSign)) {
                    urlData.setPlatform(UrlConst.taobaoUrlSign);
                    return fillFromIdQueryParam(urlData, url);
                }
                // JD embeds the id in the path: https://item.jd.com/<number>.html
                if (url.contains(UrlConst.jingdongUrlSign)) {
                    urlData.setPlatform(UrlConst.jingdongUrlSign);
                    String[] rootAndPath = url.split("jd\\.com/");
                    if (rootAndPath.length < 2) {
                        urlData.setStatus(0);
                        return urlData;
                    }
                    // Escape the dot: the original split(".html") treated '.'
                    // as a regex wildcard, which could match e.g. "Xhtml" too.
                    String numberStr = rootAndPath[1].split("\\.html")[0];
                    return fillNumber(urlData, numberStr);
                }

                // Unknown platform
                urlData.setStatus(0);
                return urlData;

            } catch (Exception e) {
                // Malformed number or unexpected URL shape: report failure
                e.printStackTrace();
                urlData.setStatus(0);
                return urlData;
            }
        }

        /**
         * Extracts the "id" query parameter from a Tmall/Taobao URL
         * (e.g. https://detail.tmall.com/item.htm?spm=...&id=604433373792)
         * and fills the result.
         */
        private static UrlData fillFromIdQueryParam(UrlData urlData, String url) {
            // Split "https://host/path?k1=v1&k2=v2" at the question mark
            String[] rootAndParams = url.split("\\?");
            if (rootAndParams.length < 2) {
                urlData.setStatus(0);
                return urlData;
            }
            String numberStr = "";
            // Split the query string on '&' and find the parameter "id=..."
            for (String param : rootAndParams[1].split("&")) {
                if (param.startsWith("id=")) {
                    numberStr = param.substring("id=".length());
                    break;
                }
            }
            return fillNumber(urlData, numberStr);
        }

        /** Parses the id string and marks the result as success or failure. */
        private static UrlData fillNumber(UrlData urlData, String numberStr) {
            if (StringUtils.isEmpty(numberStr)) {
                urlData.setStatus(0);
                return urlData;
            }
            // Long.valueOf instead of the deprecated new Long(String)
            urlData.setStatus(1);
            urlData.setNumber(Long.valueOf(numberStr));
            return urlData;
        }

        public static void main(String[] args) {

            String tmallUrl = "https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.8.27832a99AfoD5W&id=604433373792&skuId=4233630160968&user_id=1776477331&cat_id=2&is_b=1&rn=2eff85a6a504024ee62222a0045d9ded";
            UrlData tmall = analyseUrl(tmallUrl);
            System.out.println("tmall = " + tmall);

            // NOTE(review): this sample URL carries the id as "nid=", not "id=",
            // so analyseUrl reports status=0 for it.
            String taobaoUrl =  "https://s.taobao.com/search?spm=a230r.1.14.7.ade0695abTrJ6k&type=samestyle&app=i2i&rec_type=1&uniqpid=69915374&nid=604733501729";
            UrlData taobao = analyseUrl(taobaoUrl);
            System.out.println("taobao = " + taobao);

            String jdUrl = "https://item.jd.com/100004250098.html#none";
            UrlData jd = analyseUrl(jdUrl);
            System.out.println("jd = " + jd);

        }

    }

## 4. Crawling Tmall product details

> Java crawler steps: create an HttpClient, set the request headers, execute the request, and parse the response. The code below contains inline analysis of each step.

/**
 * Crawls a Tmall product-detail page and extracts basic product info.
 *
 * @param number the Tmall product id (the "id" query parameter)
 * @return a Product with number/platformId/title/url on success; a marker
 *         Product (id=0L, title "Item does not exist") when the page layout
 *         does not match; null on non-200 responses or network failure
 */
public Product soupTmallDetailById(Long number) {

    // Website address of the product whose information we crawl
    String url = "https://chaoshi.detail.tmall.com/item.htm?id=" + number;
    // try-with-resources: the original leaked both the client and the response
    try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
        HttpGet httpGet = new HttpGet(url);
        // Simulate a desktop browser (value taken from a real request header)
        httpGet.setHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36");
        try (CloseableHttpResponse response = httpclient.execute(httpGet)) {
            // Only parse the body when the request succeeded
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != 200) {
                return null;
            }
            String html = EntityUtils.toString(response.getEntity(), Consts.UTF_8);
            // doc holds the whole page; selectors below were found by
            // inspecting the page source in a browser (may change over time)
            Document doc = Jsoup.parse(html);
            Product product = new Product();
            try {
                // Container of the product info, then the <h1> title inside it
                Element item = doc.select("div[class='tb-wrap']").get(0);
                product.setNumber(number);
                product.setPlatformId(1);
                String title = item.select("div[class='tb-detail-hd']").select("h1").text();
                product.setTitle(title);
                product.setUrl(UrlConst.TMALL_PRODUCT_DETAIL + number);

                System.out.println("commodity title: " + title);

                return product;
            } catch (Exception e) {
                // Page structure changed or the item is gone: marker product
                product.setId(0L);
                product.setTitle("Item does not exist");
                return product;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    return null;
}
## 5. Crawling Taobao product details (note: the original heading said Jingdong, but this method targets item.taobao.com)

/**
 * Crawls a Taobao product-detail page and extracts basic product info.
 *
 * @param number the Taobao product id (the "id" query parameter)
 * @return a Product with number/platformId/title/url on success; a marker
 *         Product (id=0L, title "Item does not exist") when the page layout
 *         does not match; null on non-200 responses or network failure
 */
public Product soupTaobaoDetailById(Long number) {

    // Website address of the product whose information we crawl
    String url = "https://item.taobao.com/item.htm?id=" + number;
    // try-with-resources: the original leaked both the client and the response
    try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
        HttpGet httpGet = new HttpGet(url);
        // Simulate a desktop browser (value taken from a real request header)
        httpGet.setHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36");
        try (CloseableHttpResponse response = httpclient.execute(httpGet)) {
            // Only parse the body when the request succeeded
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != 200) {
                return null;
            }
            String html = EntityUtils.toString(response.getEntity(), Consts.UTF_8);
            // doc holds the whole page; selectors below were found by
            // inspecting the page source in a browser (may change over time)
            Document doc = Jsoup.parse(html);
            Product product = new Product();
            try {
                // Container of the product info, then the <h3> title inside it
                Element item = doc.select("div[class='tb-item-info-r']").get(0);
                product.setNumber(number);
                product.setPlatformId(2);
                String title = item.select("div[class='tb-title']").select("h3").text();
                product.setTitle(title);
                product.setUrl(UrlConst.TAOBAO_PRODUCT_DETAIL + number);

                System.out.println("commodity title: " + title);

                return product;
            } catch (Exception e) {
                // Page structure changed or the item is gone: marker product
                product.setId(0L);
                product.setTitle("Item does not exist");
                return product;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    return null;
}
## 6. Tmall keyword search

/**
 * Searches list.tmall.com for products matching the keyword and prints the
 * id, name, price, detail URL and image URL of every result.
 *
 * BUG FIX: the original ignored the "keyword" parameter and always searched
 * for a hard-coded "Towel"; the parameter is now actually used.
 *
 * @param keyword the search term; NOTE(review): should ideally be URL-encoded
 *                before being placed in the query string
 * @return currently always null - results are only printed.
 *         TODO: build Product objects from the parsed fields and return them.
 */
public List<Product> soupTaobaoByKeyWord(String keyword) {

    // try-with-resources also closes the client, which the original leaked
    try (CloseableHttpClient httpclient = HttpClients.createDefault()) {

        String url = "https://list.tmall.com/search_product.htm?q=" + keyword;
        HttpGet httpGet = new HttpGet(url);
        // Simulate a desktop browser (value taken from a real request header)
        httpGet.setHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36");

        try (CloseableHttpResponse response = httpclient.execute(httpGet)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode == 200) {
                String html = EntityUtils.toString(response.getEntity(), Consts.UTF_8);
                // doc holds the whole page; selectors below were found by
                // inspecting the page source in a browser (may change)
                Document doc = Jsoup.parse(html);
                // The result grid: one div.product per search hit
                Elements ulList = doc.select("div[class='view grid-nosku']");
                Elements liList = ulList.select("div[class='product']");
                for (Element item : liList) {
                    // Commodity ID
                    String id = item.select("div[class='product']").select("p[class='productStatus']").select("span[class='ww-light ww-small m_wangwang J_WangWang']").attr("data-item");
                    System.out.println("commodity ID: " + id);
                    // Trade name
                    String name = item.select("p[class='productTitle']").select("a").attr("title");
                    System.out.println("Commodity name:" + name);
                    // Commodity price
                    String price = item.select("p[class='productPrice']").select("em").attr("title");
                    System.out.println("Commodity price:" + price);
                    // Commodity website
                    String goodsUrl = item.select("p[class='productTitle']").select("a").attr("href");
                    System.out.println("Product website:" + goodsUrl);
                    // Image URL is lazy-loaded, hence the data-ks-lazyload attribute
                    String imgUrl = item.select("div[class='productImg-wrap']").select("a").select("img").attr("data-ks-lazyload");
                    System.out.println("Product image website:" + imgUrl);
                    System.out.println("------------------------------------");
                }
            }
            // Fully consume the entity so the connection can be released
            EntityUtils.consume(response.getEntity());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    return null;

}

## 7. A complete commodity price-history website built with this crawler — "Is it worth it?" GitHub: https://github.com/xivinChen/zhi-de-ma

Tags: Java Lombok Windows JSON

Posted on Fri, 10 Jan 2020 07:15:48 -0800 by eerikk2