10 Hours Start Big Data: Chapter 6 - Hadoop Project Practice

Overview of User Behavior Log

User Behavior Log:
All behavioral data (access, browse, search, click, etc.) of the user each time he visits the website
User Behavior Trajectory, Traffic Log
Why Log User Access Behavior
Visits to Web pages
User stickiness of the website
How logs are generated
Content of User Behavior Log
Account number
Access time zone
Client Browser
Access module
How to Jump
And so on.
Log data content:
1) System attributes accessed: operating system, browser, etc.
2) Access features: the URL clicked, the referer (the URL the visit came from), dwell time on the page, etc.
3) Access information: session_id, ip, etc.
The following is a log record

2013-05-19 13:00:00     http://www.taobao.com/17/?tracker_u=1624169&type=1 B58W48U4WKZCJ5D1T3Z9ZY88RU7QA7B1        http://hao.360.cn/ 

Significance of User Behavior Log Analysis

The eye of the website
The nerves of the website
Website Brain

Offline Data Processing Architecture (Emphasis: Architecture Diagrams are Important)

Data Processing Flow
1) Data acquisition
Flume: web log written to HDFS
2) Data cleaning
Dirty data
Spark, Hive, MapReduce or some other distributed computing framework
Data after cleaning can be stored in HDFS(Hive/Spark SQL)
3) Data Processing
Statistics and analysis of corresponding business according to our needs
Spark, Hive, MapReduce or some other distributed computing framework
4) Storage of processing results
The results can be stored in an RDBMS or a NoSQL store
5) Visualization of data
Graphical presentation: pie chart, bar chart, map, polyline chart
Project requirements
Count visits per browser from the imooc main-site access logs
User Agent Parsing Class Testing for Functional Implementation
Collecting browser information based on log information
Statistics for different browsers
Open source projects use the User Agent Parser of github.com/yammer/user_agent
Clone the repository to your local machine
Compile with Maven: mvn clean package -DskipTests
Install into the local Maven repository: mvn clean install -DskipTests
Add parsing dependencies to pom.xml

<!-- Add the UserAgent parsing dependency -->

Writing test classes

	 * UserAgent Test class
	public class UserAgentTest{
	  // Unit Testing: Use of UserAgent Tool Class
	  public void testUserAgentParser(){
	    String source = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36";
	    UserAgentParser userAgentParser = new UserAgentParser();
	    UserAgent agent = userAgentParser.parse(source);
	    // Browser Information
	    String browser = agent.getBrowser();
	    // There is a lot of information you can choose to print on your own.

Single-machine Local Completion Requirement Statistics for Functional Implementation
The full log is large, so take only the first 100 lines: head -n 100 10000_access.log > 100_access.log. Check how many lines the result has with: wc -l 100_access.log
Self-written code, java stand-alone version test

	public class UserAgentTest{
	    public void testReadFile() throws Exception{
	        String path = "/users/rocky/data/imooc/100_access.log";
	        BufferReader reader = new BufferReader(
	          new InputStreamReader(new FileInputStream(new File(path)))
	      	String line = "";
	        int i = 0;
	      	Map<String,Integer> browserMap = new HashMap<String,Integer>();
	        UserAgentParser userAgentParser = new UserAgentParser();
	        while(line != null){
	        line = reader.readLine(); // Read in one row at a time
	          String source = line.subString(getCharacterPosition(value,"\"",7)) + 1;
	          UserAgent agent = userAgentParser.parse(source);
	          // Browser information, there is a lot of information you can choose to print
	    	  String browser = agent.getBrowser();
	  		  int browserValue = browserMap.get(browser);
	          if (browserValue != null){
	            browserMap.put(browser,browserValue + 1)
	          }else {
	             browserMap.put(browser, 1)
	      System.out.println("Total:" + i + "Row data");
	      for (Map.Entry<String,Integer> entry : browserMap.entrySet()){
	  // Test the custom method getCharacterPosition
	  public void testGetCharacterPosition(){
	    String value = "......";
	    // Gets the position of the seventh "double quotation mark" in the string
	    int index = getCharacterPosition(value,"\"",7);
	  // Gets the index position of the occurrence of the specified identifier string in the specified string
	  private int getCharacterPosition(String value, String operator, int index){
	    Matcher slashMatcher = Pattern.compile(operator).matcher(value);
	    int mIdx = 0;
	      if(mIdx == index){
	    return slashMatcher.start();

Use mapreduce to complete requirement statistics

Use the Maven assembly plugin to bundle the UserAgent dependency into the job jar: mvn assembly:assembly

package com.imooc.hadoop.project;

import com.kumkee.userAgent.UserAgent;
import com.kumkee.userAgent.UserAgentParser;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class LogApp
	public static class MyReducer extends Reducer

		protected void reduce(Text key, Iterable values, org.apache.hadoop.mapreduce.Reducer.Context context)
			throws IOException, InterruptedException
			long sum = 0L;
			for (Iterator iterator = values.iterator(); iterator.hasNext();)
				LongWritable value = (LongWritable)iterator.next();
				sum += value.get();

			context.write(key, new LongWritable(sum));

		protected volatile void reduce(Object obj, Iterable iterable, org.apache.hadoop.mapreduce.Reducer.Context context)
			throws IOException, InterruptedException
			reduce((Text)obj, iterable, context);

		public MyReducer()

	public static class MyMapper extends Mapper

		LongWritable one;
		private UserAgentParser userAgentParser;

		protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context)
			throws IOException, InterruptedException
			userAgentParser = new UserAgentParser();

		protected void map(LongWritable key, Text value, org.apache.hadoop.mapreduce.Mapper.Context context)
			throws IOException, InterruptedException
			String line = value.toString();
			String source = (new StringBuilder()).append(line.substring(LogApp.getCharacterPosition(line, "\"", 7))).append(1).toString();
			UserAgent agent = userAgentParser.parse(source);
			String browser = agent.getBrowser();
			context.write(new Text(browser), one);

		protected void cleanup(org.apache.hadoop.mapreduce.Mapper.Context context)
			throws IOException, InterruptedException
			userAgentParser = null;

		protected volatile void map(Object obj, Object obj1, org.apache.hadoop.mapreduce.Mapper.Context context)
			throws IOException, InterruptedException
			map((LongWritable)obj, (Text)obj1, context);

		public MyMapper()
			one = new LongWritable(1L);

	public LogApp()

	private static int getCharacterPosition(String value, String operator, int index)
		Matcher slashMatcher = Pattern.compile(operator).matcher(value);
		for (int mIdx = 0; slashMatcher.find() && ++mIdx != index;);
		return slashMatcher.start();

	public static void main(String args[])
		throws Exception
		Configuration configuration = new Configuration();
		Path outputPath = new Path(args[1]);
		FileSystem fileSystem = FileSystem.get(configuration);
		if (fileSystem.exists(outputPath))
			fileSystem.delete(outputPath, true);
			System.out.println("output file exists, but is has deleted");
		Job job = Job.getInstance(configuration, "LogApp");
		FileInputFormat.setInputPaths(job, new Path[] {
			new Path(args[0])
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);


Executing jobs on YARN
hadoop jar /home/hadoop/lib/hadoop-train-1.0-jar-with-dependencies.jar com.imooc.hadoop.project.LogApp /10000_access.log /browserout

Tags: Hadoop Apache Java Spark

Posted on Sun, 25 Aug 2019 22:34:59 -0700 by verdrm