/*
 * Name:   Word Count example on Amazon EMR
 * Input:  text file(s) stored in an S3 bucket
 * Output: number of lines containing the substring 'islands' and
 *         number of lines containing the substring 'the'
 */
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.log4j.Logger
import org.apache.log4j.Level

/**
 * Spark driver that reads a text data set from S3 and prints how many lines
 * contain each of two fixed substrings.
 *
 * Matching uses `String.contains`, so it is a case-sensitive substring test on
 * each line, not a whole-word count: a line containing "there" is counted for
 * "the", and a line with several matches is counted once.
 */
object WordCount {

  /**
   * Program entry point.
   *
   * @param args command-line arguments; currently unused
   */
  def main(args: Array[String]): Unit = {
    // Set the logging level of Spark's own loggers to ERROR.
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)

    // Input location in S3; replace with any path readable by the cluster.
    val myInput = "s3://elasticmapreduce/samples/wordcount/input"

    val conf = new SparkConf().setAppName("Word Count")
    val sc = new SparkContext(conf)

    try {
      // Request at least 2 partitions and cache the RDD, because it is
      // scanned twice below (once per count).
      val inputData = sc.textFile(myInput, 2).cache()

      // Count the lines that contain each substring.
      val wordA = inputData.filter(_.contains("islands")).count()
      val wordB = inputData.filter(_.contains("the")).count()

      println(s"Number of lines with word 'islands' $wordA")
      println(s"Number of lines with word 'the' $wordB")
    } finally {
      // Always release the SparkContext, even when the job fails.
      sc.stop()
    }
  }
}