// [Scala] view as plain text / copy code
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
import org.apache.spark.HashPartitioner
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext.rddToPairRDDFunctions; // Never referenced explicitly, but this import is required; without it an error is reported
/**
* Leo Dj @2015-10-09
*/
// Spark job that splits a comma-separated text file into one output file per
// distinct value of a chosen column. Each input line is written unchanged to
// the file named after that column's value.
//
// Usage: MultipleTextOutput [inputPath [outputPath]]
// Both arguments are optional and default to the paths below.
object MultipleTextOutput {
  import org.apache.hadoop.io.NullWritable

  private val DefaultInputPath = "/user/input/operation.log"
  private val DefaultOutputPath = "/user/output/merge"
  private val Separator = ","
  // Zero-based index of the column whose value becomes the output file name.
  private val FileNameColumn = 72
  private val NumPartitions = 3
  // File that receives lines lacking a usable value in FileNameColumn.
  private val FallbackFileName = "unknown"

  /**
   * Output format that names each file after the record key and writes only
   * the record value.
   *
   * Declared at object level (not inside `main`) so that it is an ordinary
   * class with a no-argument constructor, which is what reflective
   * instantiation by the output machinery needs.
   */
  class RDDMultipleTextOutputFormatter extends MultipleTextOutputFormat[Any, Any] {

    /**
     * Drops the key from the written record. With a null-like key the text
     * writer emits just the value, so no key/separator prefix is added to the
     * original line.
     */
    override def generateActualKey(key: Any, value: Any): Any = NullWritable.get()

    /**
     * Uses the record key as the file name.
     *
     * @param key   file name computed by [[MultipleTextOutput.fileNameFor]]
     * @param value the original input line (unused here)
     * @param name  default leaf name proposed by the framework (unused here)
     */
    override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
      key.toString
  }

  /**
   * Computes the output file name for one input line.
   *
   * @param line a raw input line
   * @return the value of column `FileNameColumn`, or `FallbackFileName` when
   *         the line has too few columns or that column is empty
   */
  private def fileNameFor(line: String): String = {
    // A negative limit keeps trailing empty fields, so a line whose last
    // columns are empty still has the expected number of fields.
    val fields = line.split(Separator, -1)
    fields.lift(FileNameColumn).filter(_.nonEmpty).getOrElse(FallbackFileName)
  }

  /**
   * Entry point.
   *
   * @param args optional `inputPath` and `outputPath`, in that order
   */
  def main(args: Array[String]): Unit = {
    val inputPath = args.lift(0).getOrElse(DefaultInputPath)
    val outputPath = args.lift(1).getOrElse(DefaultOutputPath)

    val conf = new SparkConf().setAppName("SplitTest")
    val sc = new SparkContext(conf)
    try {
      sc.textFile(inputPath)
        // Key every line by its target file name and partition on that key,
        // so all lines destined for one file are handled by a single task.
        // Partitioning on the whole line would let several tasks write a file
        // with the same name, which can make them overwrite one another when
        // the job output is committed.
        .map(line => (fileNameFor(line), line))
        .partitionBy(new HashPartitioner(NumPartitions))
        .saveAsHadoopFile(
          outputPath,
          classOf[String],
          classOf[String],
          classOf[RDDMultipleTextOutputFormatter])
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}