package llf
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkContext, SparkConf}
import scala.collection.mutable.ListBuffer
/**
* Created by sendoh on 2015/6/26.
*/
object SQLText {
def main(args: Array[String]): Unit = {
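// Silence Spark's and Jetty's verbose logging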
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
if (args.length != 3) {
println("Usage: java -jar code.jar dependency_jars file_location save_location")
System.exit(1)
}
val jars = ListBuffer[String]()
args(0).split(',').foreach(jars += _) // accumulate the comma-separated dependency jars
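// The master and Spark home below are hard-coded for local testing;
// a cluster deployment would normally supply them through spark-submit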
val conf = new SparkConf().setAppName("SQLText").setMaster("local[2]").setSparkHome("/usr/local/spark-1.2.0-bin-hadoop2.4").setJars(jars)
val sc = new SparkContext(conf)
val ssc = new SQLContext(sc)
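// createSchemaRDD brings in the implicit conversion from an RDD of case-class
// instances to a SchemaRDD (the pre-1.3 DataFrame), which provides registerTempTable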
import ssc.createSchemaRDD
case class SOOU(ID: Int, username: String, dataID: String, local: String, url: String, time: Int)
// Load the search log, build SOOU rows, and register them as a temp table
// (registerTempTable returns Unit, so there is nothing to bind to a val)
sc.textFile("hdfs://localhost:9000/datatnt/Soou.txt").map(_.split(','))
.map(p => SOOU(p(0).toInt, p(1), p(2), p(3), p(4), p(5).trim.toInt)).registerTempTable("textdata")
//
// How many rows does the table contain?
ssc.sql("SELECT count(*) FROM textdata").collect().foreach(println)
// Show the first 100 rows
ssc.sql("SELECT * FROM textdata limit 100").collect().foreach(println)
// How many rows have ID 1 but dataID 3? (dataID is a String column, so compare against a string literal)
ssc.sql("SELECT count(*) FROM textdata WHERE ID = 1 and dataID = '3'").collect().foreach(println)
// How many rows have a clicked URL containing 'china'?
ssc.sql("SELECT count(*) FROM textdata WHERE url like '%china%'").collect().foreach(println)
// How many rows have ID 1, dataID 4, and a URL containing 'china'?
ssc.sql("SELECT count(*) FROM textdata WHERE ID = 1 and dataID = '4' and url like '%china%'").collect().foreach(println)
//
// Practice only: the original stops here without saving results; a sketch of saving follows
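// A minimal sketch of what saving could look like, assuming args(2) is a
// writable HDFS or local path (not part of the original program):
ssc.sql("SELECT * FROM textdata").map(_.mkString(",")).saveAsTextFile(args(2))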
}
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
package llf
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkContext, SparkConf}
import scala.collection.mutable.ListBuffer
/**
* Created by sendoh on 2015/6/26.
*/
object SQLLOOL {
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
if (args.length != 3) {
println("Usage: java -jar code.jar dependency_jars file_location save_location")
System.exit(1)
}
val jars = ListBuffer[String]()
args(0).split(',').foreach(jars += _) // accumulate the comma-separated dependency jars
// Set up the runtime environment
val conf = new SparkConf().setAppName("SQLLOOL").setMaster("local[2]").setSparkHome("/usr/local/spark-1.2.0-bin-hadoop2.4").setJars(jars)
val sc = new SparkContext(conf)
val ssc = new SQLContext(sc)
// Use Spark SQL to work out each store's sales quantity and amount
import ssc.createSchemaRDD
// The Date table classifies each day, assigning it the month, week, quarter, and other periods it belongs to:
// date ID, year-month, year, month, day, day of week, week number, quarter, ten-day period, half-month
case class Date(dateID: String, theyearmonth: String, theyear: String, themonth: String, thedate: String,
theweek: String, theweeks: String, thequot: String, thetenday: String, thehalfmonth: String)
// The Stock file defines the order header:
// order number, location ID, date ID
case class Stock(ordernumber: String, locationid: String, dateID: String)
// The StockDetail file defines the order line items:
// order number, item ID, row number, quantity, price, amount
case class StockDetail(ordernumber: String, itemid: String, rownum: Int, qty: Int, price: Int, amount: Int)
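// Spark SQL infers each table's schema from these case classes via reflection
// (note: case classes in Scala 2.10 are limited to 22 fields)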
// Register the temporary tables; the names must match those used in the FROM clauses below
sc.textFile("hdfs://localhost:9000/datatnt/sale.txt").map(_.split(",")).map(p => Date(p(0), p(1), p(2), p(3), p(4),
p(5), p(6), p(7), p(8), p(9).trim)).registerTempTable("Date")
sc.textFile("hdfs://localhost:9000/datatnt/stock.txt").map(_.split(",")).map(t =>
Stock(t(0), t(1), t(2).trim)).registerTempTable("Stock")
sc.textFile("hdfs://localhost:9000/datatnt/stockdetail.txt").map(_.split(",")).map(y =>
StockDetail(y(0), y(1), y(2).toInt, y(3).toInt, y(4).toInt, y(5).trim.toInt)).registerTempTable("StockDetail")
//
// Look for anomalous data: first sum the amounts over the order join alone,
// then sum again after also joining on Date, and finally list Stock rows whose
// dateID has no match in Date (written as an outer join, since the 1.x
// SQLContext parser may not accept IN with a subquery)
ssc.sql("SELECT sum(c.amount) FROM Stock b, StockDetail c WHERE b.ordernumber=c.ordernumber").collect().foreach(println)
//
ssc.sql("SELECT sum(c.amount) FROM Stock b, StockDetail c, Date a WHERE b.ordernumber=c.ordernumber and b.dateID=a.dateID").collect().foreach(println)
//
ssc.sql("SELECT b.* FROM Stock b LEFT OUTER JOIN Date a ON b.dateID = a.dateID WHERE a.dateID IS NULL").collect().foreach(println)
//
// Yearly order count and total sales amount across all orders
ssc.sql("SELECT c.theyear, count(distinct a.ordernumber), sum(b.amount) FROM Stock a, StockDetail b," +
" Date c WHERE a.ordernumber=b.ordernumber and c.dateID=a.dateID GROUP BY c.theyear ORDER BY c.theyear").collect().foreach(println)
// Top 10 quarters by sales amount across all orders
ssc.sql("SELECT c.theyear, c.thequot, sum(b.amount) AS sumofamount FROM Stock a, StockDetail b," +
" Date c WHERE a.ordernumber=b.ordernumber and a.dateID=c.dateID GROUP BY c.theyear, c.thequot ORDER BY sumofamount desc limit 10").collect().foreach(println)
// List orders whose total amount exceeds 100000
ssc.sql("SELECT a.ordernumber, sum(b.amount) AS sumofamount FROM Stock a," +
" StockDetail b WHERE a.ordernumber=b.ordernumber GROUP BY a.ordernumber HAVING sumofamount>100000").collect().foreach(println)
}
}