Spark: sort the numbers found in local files and tag each with a sequence number

#!/usr/bin/env python3
# The line above is a shebang; for background see https://www.jianshu.com/p/400c612381dd

from pyspark import SparkConf, SparkContext

# Driver-side counter used to hand out 1-based sequence numbers.
index = 0


def getindex():
    """Advance the global counter by one and return its new value."""
    global index
    index = index + 1
    return index
def main():
    """Sort the integers stored in local text files and number them.

    Reads every matching input file, drops blank lines, sorts the numbers
    in ascending order within a single partition, pairs each number with a
    1-based rank, then prints the pairs and saves them to a result directory.
    """
    # Fixed: original had a typo ``setMatser`` which raises AttributeError.
    conf = SparkConf().setMaster("local[1]").setAppName("FileSort")
    sc = SparkContext(conf=conf)
    lines = sc.textFile("file:///usr/local/spark/mycode/rdd/filesort/file*.text")
    # Keep only non-blank lines.
    result1 = lines.filter(lambda line: (len(line.strip()) > 0))
    # Fixed: original ``lambda:x:`` is a syntax error.
    # Strip whitespace and build (number, "") pairs so sortByKey can be used.
    result2 = result1.map(lambda x: (int(x.strip()), ""))
    # Collapse to one partition so the global sort order (and the sequence
    # numbers assigned below) are well defined.
    result3 = result2.repartition(1)
    result4 = result3.sortByKey(True)
    result5 = result4.map(lambda x: x[0])
    # NOTE(review): getindex mutates a global counter; this yields a correct
    # 1..n numbering only because everything runs in a single partition on
    # local[1]. rdd.zipWithIndex() would be the robust alternative.
    result6 = result5.map(lambda x: (getindex(), x))
    result6.foreach(print)
    result6.saveAsTextFile("file:///usr/local/spark/mycode/rdd/filesort/sortresult")


# Fixed: original ``if__name__`` lacked a space (syntax error) and was
# indented inside main(), so the script could never start; the guard
# belongs at module level.
if __name__ == '__main__':
    main()

写博客确实有帮助
自己敲一遍体会还是不一样
来源于厦门大学数据课程自己总结

发布了25 篇原创文章 · 获赞 0 · 访问量 386

猜你喜欢

转载自blog.csdn.net/qq_45371603/article/details/104585965