import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.stream.Stream;

/**
 * Copies every line of {@code test.log} that looks like a single-line JSON object
 * into {@code test.log-2}, using a Java 8 stream filter.
 */
public class TextTest {

    /**
     * Reads {@code test.log} lazily, keeps the lines whose trimmed text starts with
     * <code>{</code> and ends with <code>}</code>, and writes them to {@code test.log-2}.
     *
     * <p>Matching lines are written as read (the trim is only used for the check).
     * An {@link IOException} from opening or writing the files is printed to standard
     * error and the method returns normally.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Path path1 = Paths.get("test.log");
        Path path2 = Paths.get("test.log-2");

        // Files.lines keeps the file open until the stream is closed, hence try-with-resources.
        try (Stream<String> stream = Files.lines(path1)) {
            Stream<String> jsonLines = stream.filter(TextTest::isJsonObjectLine);
            // Files.write accepts an Iterable; a Stream is adapted through its iterator()
            // method reference. The stream is consumed exactly once by this write.
            Iterable<String> output = jsonLines::iterator;
            Files.write(path2, output);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Tells whether a line, ignoring surrounding whitespace, is delimited by curly braces.
     *
     * @param line the raw line from the input file
     * @return {@code true} if the trimmed line starts with <code>{</code> and ends with <code>}</code>
     */
    private static boolean isJsonObjectLine(String line) {
        // Trim once instead of once per filter stage.
        String trimmed = line.trim();
        return trimmed.startsWith("{") && trimmed.endsWith("}");
    }
}
JAVA8 Stream Filter 를 활용한 file read to file write
Hadoop hdfs Tajo 메모 중
공부 메모 중
[환경]
- centos 6.4
- java 7
- hadoop-2.7.3
- tajo-0.11.3
[구성]
server1 : NameNode, TajoMaster
server2 : DataNode, TajoWorker (SecondaryNameNode)
server3 : DataNode, TajoWorker
server4 : DataNode, TajoWorker
# 클러스터 구성은 생략
bin/hadoop namenode -format   # 초기화 시 rm -Rf /tmp/hadoop-tomcat 하고 나서
bin/start-all.sh
# 접속 http://192.168.100:50070/dfshealth.html
-- 데이터 : {"reg_dt":1474350438172,"jsessionid":"c09db86d-22c1-464b-92af-d612d7274c66","url_now":"http://ddakker.pe.kr/b","click_page_url":"http://ddakker.pe.kr/a","ip":"192.168.0.100","user_key":"1234567890","url_before":"http://ddakker.pe.kr/a","session_first_time":1474289056726,"user_agent":"Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko"}
bin/tsql
default> \dfs -mkdir /tajo
default> \dfs -ls /
Found 1 items
drwxr-xr-x - hadoop supergroup 0 2016-09-20 15:27 /tajo
default> \dfs -put /logs/weblog /tajo/weblog
default> \dfs -du -s -h /tajo/weblog
1.5 G /tajo/weblog
create external table weblog ( reg_dt INT8, url_before text, jsessionid text, ip text, url_now text, click_page_url text, user_key text, session_first_time INT8) USING JSON LOCATION 'hdfs:/tajo/weblog';
default> select count(*) from weblog;
Progress: 0%, response time: 0.465 sec
Progress: 0%, response time: 0.467 sec
Progress: 0%, response time: 0.869 sec
Progress: 0%, response time: 1.67 sec
Progress: 46%, response time: 2.672 sec
Progress: 100%, response time: 2.735 sec
?count
-------------------------------
3503135
(1 rows, 2.735 sec, 16 B selected)
Spark 메모중..
공부 메모 중...
[환경]
- centos 6.4
- java 7
- spark-2.0.0-bin-hadoop2.7
cd jars
# json format... 관련..
wget http://www.congiu.net/hive-json-serde/1.3/cdh5/json-serde-1.3-jar-with-dependencies.jar
sbin/start-thriftserver.sh
bin/beeline -u jdbc:hive2://localhost:10000
echo "1|abc|1.1|a" >> test.csv
echo "2|def|2.3|b" >> test.csv
create table if not exists testCsv (id INT, name STRING, score FLOAT, type STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|';
load data local inpath '/usr/local/tomcat/test.csv' into table testCsv;
0: jdbc:hive2://localhost:10000> select * from testCsv;
+-----+-------+--------------------+-------+--+
| id | name | score | type |
+-----+-------+--------------------+-------+--+
| 1 | abc | 1.100000023841858 | a |
| 2 | def | 2.299999952316284 | b |
+-----+-------+--------------------+-------+--+
echo "{id: 1, name: 'abc', score: 1.1, type: 'a'}" >> test.json
echo "{id: 2, name: 'def', score: 2.2, type: 'b'}" >> test.json
create table if not exists testJson (id INT, name STRING, score FLOAT, type STRING) ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe';
load data local inpath '/usr/local/tomcat/test.json' into table testJson;
0: jdbc:hive2://localhost:10000> select * from testJson;
+-----+-------+--------------------+-------+--+
| id | name | score | type |
+-----+-------+--------------------+-------+--+
| 1 | abc | 1.100000023841858 | a |
| 2 | def | 2.200000047683716 | b |
+-----+-------+--------------------+-------+--+