diff --git a/2-import/hive-import.sh b/2-import/hive-import.sh
index 79aff2df083dbcb7eebe02be642aba35b5a95737..c4e152a00a709340817713fbb77c7631c412a688 100755
--- a/2-import/hive-import.sh
+++ b/2-import/hive-import.sh
@@ -2,13 +2,20 @@
 
 DBNAME="`id -un`_statistics"
 JDBC_URL="jdbc:hive2://hador-c1.ics.muni.cz:10000/$DBNAME;principal=hive/hador-c1.ics.muni.cz@ICS.MUNI.CZ"
+if [ -z "$YEAR" ]; then
+	YEAR=`date '+%Y'`
+fi
 BASE_PATH="`dirname $0`"
 LOCAL_PATH="`pwd`"
 HDFS_PATH="/user/`id -un`/statistics"
+OPTIONS_IMPORT='TBLPROPERTIES("skip.header.line.count"="1")'
+OPTIONS_RCFILE=''
 
-ACTIONS=${@:-'upload drop init'}
+ACTIONS=${@:-'upload drop init process'}
 TABLES='jobs subjobs nodes jobnodes counters jobcounters intervals'
+start_time=`TZ=C date --date="$((YEAR-1))-01-01" '+%s'`
+end_time=`TZ=C date --date="$YEAR-01-01" '+%s'`
 
 echo "Target HDFS path: ${HDFS_PATH}"
 echo
@@ -24,10 +31,22 @@ for action in ${ACTIONS}; do
 
 	upload)
 		hdfs dfs -rm -r ${HDFS_PATH} || :
-		hdfs dfs -mkdir -p ${HDFS_PATH}/jobs/
+		hdfs dfs -mkdir -p ${HDFS_PATH}/jobs/ ${HDFS_PATH}/jobs-import/
 		for t in ${TABLES}; do
-			hdfs dfs -mkdir ${HDFS_PATH}/jobs/${t} || : 2>/dev/null
-			hdfs dfs -put ${LOCAL_PATH}/${t}.csv ${HDFS_PATH}/jobs/${t}/
+			hdfs dfs -mkdir ${HDFS_PATH}/jobs/${t} ${HDFS_PATH}/jobs-import/${t} || : 2>/dev/null
+			hdfs dfs -put ${LOCAL_PATH}/${t}.csv ${HDFS_PATH}/jobs-import/${t}/
+		done
+		;;
+
+	process)
+		for t in counters intervals nodes; do
+			beeline -u $JDBC_URL -e "INSERT OVERWRITE TABLE ${t} SELECT * FROM ${t}_import;"
+		done
+		for t in jobs subjobs; do
+			beeline -u $JDBC_URL -e "INSERT OVERWRITE TABLE ${t} SELECT * FROM ${t}_import WHERE finish >= ${start_time}000L AND start < ${end_time}000L;"
+		done
+		for t in jobcounters jobnodes; do
+			beeline -u $JDBC_URL -e "INSERT OVERWRITE TABLE ${t} SELECT * FROM ${t}_import WHERE jobid IN (SELECT jobid FROM jobs);"
 		done
 		;;
 
@@ -36,9 +55,23 @@ for action in ${ACTIONS}; do
 		;;
 
 	init)
-		sed -e "s,@HDFS_PATH@,$HDFS_PATH,g" "${BASE_PATH}/hive.sql.in" > hive.sql
-		beeline -u $JDBC_URL -e "CREATE DATABASE $DBNAME"
-		beeline -u $JDBC_URL -f hive.sql
+		sed \
+			-e "s,@HDFS_PATH@,$HDFS_PATH,g" \
+			-e "s,@LOCAL_PATH@,jobs-import,g" \
+			-e "s,@SUFFIX@,_import,g" \
+			-e "s,@OPTIONS@,$OPTIONS_IMPORT,g" \
+			"${BASE_PATH}/hive.sql.in" > hive.sql
+
+		echo >> hive.sql
+
+		sed \
+			-e "s,@HDFS_PATH@,$HDFS_PATH,g" \
+			-e "s,@LOCAL_PATH@,jobs,g" \
+			-e "s,@SUFFIX@,,g" \
+			-e "s,@OPTIONS@,,g" \
+			"${BASE_PATH}/hive.sql.in" >> hive.sql
+		#beeline -u $JDBC_URL -e "CREATE DATABASE $DBNAME"
+		#beeline -u $JDBC_URL -f hive.sql
 		;;
 
 esac
diff --git a/2-import/hive.sql.in b/2-import/hive.sql.in
index a7b369cbfea5eb0a31cf2c7aae6bcbaee0586210..cef608d7d7c19829afa8cd60ec7d28ccc2f88cc7 100644
--- a/2-import/hive.sql.in
+++ b/2-import/hive.sql.in
@@ -1,4 +1,4 @@
-CREATE EXTERNAL TABLE jobs (
+CREATE EXTERNAL TABLE jobs@SUFFIX@ (
   id CHAR(80),
   name CHAR(128),
   user CHAR(20),
@@ -16,8 +16,8 @@ CREATE EXTERNAL TABLE jobs (
   changed TIMESTAMP
 )
 ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
-STORED AS TEXTFILE LOCATION '@HDFS_PATH@/jobs/jobs'
-TBLPROPERTIES("skip.header.line.count"="1");
+STORED AS TEXTFILE LOCATION '@HDFS_PATH@/@LOCAL_PATH@/jobs'
+@OPTIONS@;
 
 
 CREATE EXTERNAL TABLE subjobs (
@@ -30,8 +30,8 @@ CREATE EXTERNAL TABLE subjobs (
   finish BIGINT
 )
 ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
-STORED AS TEXTFILE LOCATION '@HDFS_PATH@/jobs/subjobs'
-TBLPROPERTIES("skip.header.line.count"="1");
+STORED AS TEXTFILE LOCATION '@HDFS_PATH@/@LOCAL_PATH@/subjobs'
+@OPTIONS@;
 
 
 CREATE EXTERNAL TABLE jobnodes (
@@ -42,8 +42,8 @@ CREATE EXTERNAL TABLE jobnodes (
   nreduce INT
 )
 ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
-STORED AS TEXTFILE LOCATION '@HDFS_PATH@/jobs/jobnodes'
-TBLPROPERTIES("skip.header.line.count"="1");
+STORED AS TEXTFILE LOCATION '@HDFS_PATH@/@LOCAL_PATH@/jobnodes'
+@OPTIONS@;
 
 
 CREATE EXTERNAL TABLE jobcounters (
@@ -55,8 +55,8 @@ CREATE EXTERNAL TABLE jobcounters (
   total BIGINT
 )
 ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
-STORED AS TEXTFILE LOCATION '@HDFS_PATH@/jobs/jobcounters'
-TBLPROPERTIES("skip.header.line.count"="1");
+STORED AS TEXTFILE LOCATION '@HDFS_PATH@/@LOCAL_PATH@/jobcounters'
+@OPTIONS@;
 
 
 CREATE EXTERNAL TABLE counters (
@@ -65,8 +65,8 @@ CREATE EXTERNAL TABLE counters (
   name CHAR(128)
 )
 ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
-STORED AS TEXTFILE LOCATION '@HDFS_PATH@/jobs/counters'
-TBLPROPERTIES("skip.header.line.count"="1");
+STORED AS TEXTFILE LOCATION '@HDFS_PATH@/@LOCAL_PATH@/counters'
+@OPTIONS@;
 
 
 CREATE EXTERNAL TABLE nodes (
@@ -74,8 +74,8 @@ CREATE EXTERNAL TABLE nodes (
   host VARCHAR(256)
 )
 ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
-STORED AS TEXTFILE LOCATION '@HDFS_PATH@/jobs/nodes'
-TBLPROPERTIES("skip.header.line.count"="1");
+STORED AS TEXTFILE LOCATION '@HDFS_PATH@/@LOCAL_PATH@/nodes'
+@OPTIONS@;
 
 
 CREATE EXTERNAL TABLE intervals (
@@ -83,5 +83,6 @@ CREATE EXTERNAL TABLE intervals (
   finish INT
 )
 ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
-STORED AS TEXTFILE LOCATION '@HDFS_PATH@/jobs/intervals/'
-TBLPROPERTIES("skip.header.line.count"="1");
+STORED AS TEXTFILE LOCATION '@HDFS_PATH@/@LOCAL_PATH@/intervals'
+@OPTIONS@;
+