#!/usr/bin/env python3
# For what the shebang line above does, see https://www.jianshu.com/p/400c612381dd
from pyspark import SparkConf, SparkContext
# Module-level counter backing getindex(); starts at 0 so the first call returns 1.
index = 0


def getindex() -> int:
    """Return the next value of the module-level counter.

    Each call increments the global ``index`` by one and returns the new
    value, so successive calls in the same process yield 1, 2, 3, ...

    Returns:
        The counter value after incrementing.
    """
    global index  # rebinding the module-level name, not creating a local
    index += 1
    return index
def main():
    """Sort the integers in the input files and save them with a 1-based rank.

    Reads every file matching the input glob, drops blank lines, parses each
    remaining line as an integer, sorts the values in ascending order and
    emits ``(rank, value)`` tuples, where ``rank`` starts at 1. The tuples are
    printed and also written as text to the ``sortresult`` directory.

    A non-blank line that cannot be parsed by ``int()`` makes the Spark job
    fail.
    """
    conf = SparkConf().setMaster("local[1]").setAppName("FileSort")
    sc = SparkContext(conf=conf)
    try:
        lines = sc.textFile("file:///usr/local/spark/mycode/rdd/filesort/file*.text")
        non_blank = lines.filter(lambda line: len(line.strip()) > 0)
        # Strip surrounding whitespace and turn each number into a (key, "")
        # pair so that sortByKey can order the records by their integer value.
        keyed = non_blank.map(lambda x: (int(x.strip()), ""))
        # A single partition keeps the sorted output as one totally ordered
        # sequence (and one output file).
        ordered = keyed.repartition(1).sortByKey(True).map(lambda x: x[0])
        # zipWithIndex assigns 0-based positions in RDD order; shift to 1-based.
        # This avoids mutating a global counter inside a closure, whose value
        # would depend on how and how often Spark evaluates the lineage.
        ranked = ordered.zipWithIndex().map(lambda pair: (pair[1] + 1, pair[0]))
        ranked.foreach(print)
        ranked.saveAsTextFile("file:///usr/local/spark/mycode/rdd/filesort/sortresult")
    finally:
        # Always release the SparkContext, even if a job above fails.
        sc.stop()
# Run the job only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
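# Notes:
# Writing blog posts really does help.
# Typing the code out yourself gives a different level of understanding.
# Based on the Xiamen University big-data course; this is my own summary.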