spark MLlib DataType ML中的数据类型

package ML.DataType;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.*;
import org.apache.spark.mllib.linalg.distributed.*;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils; import java.util.Arrays; /**
* TODO
*
* @ClassName: DataType
* @author: DingH
* @since: 2019/4/3 10:06
*/
public class DataType {
public static void main(String[] args) { SparkConf conf = new SparkConf().setMaster("local").setAppName("Datatype");
JavaSparkContext javaSparkContext = new JavaSparkContext(conf); /**
* @Title: vectors.dense方法生成向量,sparse生成稀疏向量。第一个3是向量的大小,第二个列表是不为0的下表,第三个是对应的value.
*/
Vector dense = Vectors.dense(1.0, 0.0, 3.0);
Vector sparse = Vectors.sparse(3, new int[]{0, 2}, new double[]{1.0, 3.0}); /**
* @Title: 对向量进行标记,1.0为正,0.0为负
*/
LabeledPoint labeledPoint = new LabeledPoint(1.0, dense);
LabeledPoint labeledPoint1 = new LabeledPoint(0.0, sparse); /**
* @Title: libSVM文件: lable1 index1:value1 index2:value2
*/
JavaRDD<LabeledPoint> labeledPointJavaRDD = MLUtils.loadLibSVMFile(javaSparkContext.sc(), "/data...").toJavaRDD(); /**
* @Title: matricex.dense生成矩阵。3*2的矩阵 列式优先
* [1.0 2.0
* 3.0 4.0
* 5.0 6.0]
*/
Matrix dense1 = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); /**
* @Title: matricex.sparse生成稀疏矩阵。3*2的矩阵。第三个参数和第四个参数对应为不为0的元素。
* [9 0
* 0 6
* 0 8] 第三个参数: 1-0=1,3-1=2,每列不为0的元素分别是1个和2个。 第四个参数,从头开始遍历行,不为0的行。
*/
Matrix sparse1 = Matrices.sparse(3, 2, new int[]{0, 1, 3}, new int[]{0, 2, 1}, new double[]{9, 6, 8}); /**
* @Title: Rowmatrix
*/
JavaRDD<Vector> parallelize = javaSparkContext.parallelize(Arrays.asList(
Vectors.dense(1, 2, 3),
Vectors.dense(2, 3, 4),
Vectors.dense(3, 4, 5)
));
RowMatrix rowMatrix = new RowMatrix(parallelize.rdd());
long l = rowMatrix.numRows();
long l1 = rowMatrix.numCols();
QRDecomposition<RowMatrix, Matrix> rowMatrixMatrixQRDecomposition = rowMatrix.tallSkinnyQR(true); /**
* @Title: IndexedRowMatrix
*/
JavaRDD<IndexedRow> parallelize1 = javaSparkContext.parallelize(Arrays.asList(
new IndexedRow(1, dense),
new IndexedRow(2, dense),
new IndexedRow(3, dense)
));
IndexedRowMatrix indexedRowMatrix = new IndexedRowMatrix(parallelize1.rdd());
long l2 = indexedRowMatrix.numCols();
long l3 = indexedRowMatrix.numRows();
RowMatrix rowMatrix1 = indexedRowMatrix.toRowMatrix(); /**
* @Title: CoordinateMatrix
*/
JavaRDD<MatrixEntry> parallelize2 = javaSparkContext.parallelize(Arrays.asList(
new MatrixEntry(0, 1, 3),
new MatrixEntry(1, 3, 1),
new MatrixEntry(2, 1, 1)
));
CoordinateMatrix coordinateMatrix = new CoordinateMatrix(parallelize2.rdd());
long l4 = coordinateMatrix.numCols();
long l5 = coordinateMatrix.numRows();
IndexedRowMatrix indexedRowMatrix1 = coordinateMatrix.toIndexedRowMatrix(); /**
* @Title: BlocakMatrix 。 toBlockMatrix可以设置参数,规定row,col的大小,默认1024*1024
*/
BlockMatrix cache = indexedRowMatrix.toBlockMatrix().cache();
BlockMatrix cache1 = coordinateMatrix.toBlockMatrix().cache();
cache.validate();
BlockMatrix multiply = cache.transpose().multiply(cache);
}
}
上一篇:java正则表达式验证汉字


下一篇:试水STF(smartphone test farm)