Advertisements
Home > Information Technology, NOSQL > Inverted Index Application on Hadoop MapReduce

Inverted Index Application on Hadoop MapReduce

For a sample input like this:

cutaneous asphyxied backframe beminstrel cyanurin bipartile aberdavine backboned anhydrization copies acanthus bedrugs Cardiacea Arcadian carburate apple-scented ast bi-guy Blandford balloonflower covenable calumniate bespreads confirming deathless beturbaned <a href="http://4qpqohfmaofljosxtqbgxslgzxqxtz.html">link</a>

Write an inverted index application that determines, for each word, the web pages in which it occurs.

InvertedMapper.java

import java.io.IOException;

import java.util.ArrayList;

import java.util.StringTokenizer;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapred.MapReduceBase;

import org.apache.hadoop.mapred.Mapper;

import org.apache.hadoop.mapred.OutputCollector;

import org.apache.hadoop.mapred.Reporter;
/*

*

*

* @author beta13

*/

public class InvertedMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {

private Text word = new Text();

@Override

public void map(LongWritable key, Text value,OutputCollector<Text, Text> output, Reporter reporter) throws IOException {

String line = value.toString();

StringTokenizer tokenizer = new StringTokenizer(line,” <>=\”\’&\\,;”);

ArrayList<String> b = new ArrayList<String>();

String a = “”;

while (tokenizer.hasMoreTokens()) {

String buffer = tokenizer.nextToken();

if(buffer.equalsIgnoreCase(“href”)) {

a = tokenizer.nextToken();

}

else {

b.add(buffer);

}

}

for(int i = 0; i < b.size(); i++) {

output.collect(new Text(b.get(i)), new Text(a));

}

}

}

InvertedReducer.java

import java.io.IOException;

import java.util.Iterator;

import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapred.MapReduceBase;

import org.apache.hadoop.mapred.OutputCollector;

import org.apache.hadoop.mapred.Reducer;

import org.apache.hadoop.mapred.Reporter;
/*

*

*

* @author beta13

*/

public class InvertedReducer extends MapReduceBase implements Reducer<Text,Text,Text,Text> {

public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {

String a = “”;

boolean first = true;

while (values.hasNext()) {

if(!first) {

a += “, “;

}

first = false;

a += values.next().toString();

}

output.collect(key, new Text(a));

}

}

InvertedJob.java

import java.util.concurrent.Callable;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapred.FileInputFormat;

import org.apache.hadoop.mapred.FileOutputFormat;

import org.apache.hadoop.mapred.JobClient;

import org.apache.hadoop.mapred.JobConf;

import org.apache.hadoop.mapred.RunningJob;

import org.apache.hadoop.mapred.TextInputFormat;

import org.apache.hadoop.mapred.TextOutputFormat;

public class InvertedJob {

public static void main(String[] args) throws Exception {

JobConf conf = new JobConf(InvertedJob.class);

conf.setJobName(“Inverted Index”);

conf.setOutputKeyClass(Text.class);

conf.setOutputValueClass(Text.class);

conf.setMapperClass(InvertedMapper.class);

conf.setReducerClass(InvertedReducer.class);

conf.setInputFormat(TextInputFormat.class);

conf.setOutputFormat(TextOutputFormat.class);

FileInputFormat.setInputPaths(conf, new Path(“/home/kaza-sou/Documents/TPCH/webpages/”));

FileOutputFormat.setOutputPath(conf, new Path(“/home/kaza-sou/Documents/TPCH/OUTPUT”));

JobClient.runJob(conf);

}

}

Result:

-ably http://82hzwjdieclcocndnrezcdmakasjzejlyrkuanceqogertielmeoqroarhjqltbjkmekzhoqw.html

-age http://87abnvxnbkifxfhxlowbapgktvyhbmnkbompjjcltlqrryocmlenbkxu.html, http://0hshaxfcdlmigokzsdmgqzkjfolopyjcfckfapiebvmij.html

Antntonioni http://92iuhvkfrnkefzibo.html, http://41uqdw.html, http://25jeldduaqtfjtolanssnmphkngucawbvgggkjbkaupkoevosogfavmkjtemtbooiuu.html

Antonie http://22abshxafgwyhcejdsdlrtsxsbqywjsvgswzauafbwejzjsdbwosqhqjjgigqabwtyvutwausedsnwvptki.html

in http://36zkbdvlthakfgkgbxmaallwgrs.html, http://40cnsbedenrjtmavobd.html

Advertisements
  1. September 27, 2011 at 2:20 am

    Particularly well executed article

  2. July 16, 2015 at 7:59 pm

    This post will help the internet users for creating new weblog or even a blog from start to end.

  1. No trackbacks yet.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google+ photo

You are commenting using your Google+ account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

w

Connecting to %s

%d bloggers like this: