Big Data Visualization http://espinosa-oviedo.com/big-data-visualization-2016-bsc Tools and techniques for exploring data Thu, 05 May 2016 08:11:24 +0000 en-US hourly 1 https://wordpress.org/?v=5.3.4 106572429 Script http://espinosa-oviedo.com/big-data-visualization-2016-bsc/2016/05/05/script/?utm_source=rss&utm_medium=rss&utm_campaign=script Thu, 05 May 2016 08:11:24 +0000 http://espinosa-oviedo.com/big-data-visualization-2016-bsc/?p=197 ##################################################
## Developer Tools
##################################################

# Install the base tooling in one non-interactive step (-y answers every prompt):
# ssh, the build-essential toolchain, git and the OpenJDK 7 JDK (the Pig section
# further down points JAVA_HOME at this Java 7 installation).
sudo apt-get install -y ssh build-essential git openjdk-7-jdk

##################################################
## Lightning-Server
##################################################
#-------------------------------------
#-- Docker-Engine: install
#-------------------------------------
# Prerequisites for fetching packages from an HTTPS apt repository.
# -y keeps this step non-interactive, like every other install in this script.
sudo apt-get update
sudo apt-get install -y apt-transport-https ca-certificates

# Trust the Docker repository signing key and register the repository.
# The options must be written with two ASCII hyphens ("--") and the repository
# line with straight quotes; typographic dashes/quotes make both commands fail.
sudo apt-key adv --keyserver hkp://p80.pool.sks-keyservers.net:80 --recv-keys 58118E89F3A912897C070ADBF76221572C52609D
echo "deb https://apt.dockerproject.org/repo ubuntu-trusty main" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null

# Refresh the package index so the new repository is visible, then install.
sudo apt-get update
sudo apt-get install -y docker-engine
sudo apt-get -y autoremove

#-------------------------------------
#-- Docker-Engine: config & test
#-------------------------------------
# Add the invoking user to the "docker" group, then smoke-test the engine.
sudo usermod -a -G docker "$(whoami)"
sudo docker run hello-world

#-------------------------------------
#-- Lightning: install
#-------------------------------------
# First run pulls the image; the server is published on port 3000 and keeps
# the terminal attached (-i -t) until it is stopped.
sudo docker run -it -p 3000:3000 lightningviz/lightning:latest

#-------------------------------------
#-- Lightning: config run-script
#-------------------------------------
# Create the launcher /usr/local/bin/lightning-server.
# The shebang must be written with straight quotes: curly quotes would end up
# inside the file and the kernel would no longer recognise the "#!" line.
# The first write truncates the file (tee without -a) so that re-running this
# section does not append a second copy of the launcher's contents.
echo '#!/bin/bash' | sudo tee /usr/local/bin/lightning-server > /dev/null
echo "docker run -i -t -p 3000:3000 lightningviz/lightning:latest" | sudo tee -a /usr/local/bin/lightning-server > /dev/null

sudo chmod +x /usr/local/bin/lightning-server

#-------------------------------------
#-- Reboot (needed for running docker without sudo)
#-------------------------------------
# The machine restarts here; continue with the next section after logging in again.
sudo reboot

##################################################
## Rapidminer (manual install)
##################################################

# Download from https://rapidminer.com

#-------------------------------------
#-- Install (assumes rapidminer-studio is in current directory)
#-------------------------------------
sudo mv rapidminer-studio /usr/local/rapidminer-studio

#-------------------------------------
#-- Config
#-------------------------------------
# Create the launcher /usr/local/bin/rapidminer-studio: it exports
# RAPIDMINER_HOME and then starts RapidMiner-Studio.sh.
# Straight single quotes keep the text literal; curly quotes would be written
# into the file verbatim and break the shebang and the export line.
# The first write truncates (tee without -a) so a re-run does not duplicate lines.
echo '#!/bin/bash' | sudo tee /usr/local/bin/rapidminer-studio > /dev/null
echo 'export RAPIDMINER_HOME=/usr/local/rapidminer-studio/' | sudo tee -a /usr/local/bin/rapidminer-studio > /dev/null
echo '/usr/local/rapidminer-studio/RapidMiner-Studio.sh' | sudo tee -a /usr/local/bin/rapidminer-studio > /dev/null

sudo chmod +x /usr/local/bin/rapidminer-studio
##################################################
## NoSQL Databases
##################################################
#-------------------------------------
#-- MongoDB
#-------------------------------------

# Import the MongoDB 3.2 signing key and register the repository for Ubuntu
# trusty. Options need two ASCII hyphens ("--") and the repository line needs
# straight quotes; the typographic variants are not understood by the shell/apt.
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv EA312927
echo "deb http://repo.mongodb.org/apt/ubuntu trusty/mongodb-org/3.2 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-3.2.list
sudo apt-get update
sudo apt-get install -y mongodb-org

#-------------------------------------
#-- CouchDB
#-------------------------------------

# add-apt-repository is provided by software-properties-common, so that
# package goes in first; the PPA is registered next and the index refreshed
# before CouchDB itself is installed.
sudo apt-get update
sudo apt-get install -y software-properties-common
sudo add-apt-repository -y ppa:couchdb/stable
sudo apt-get update
sudo apt-get install -y couchdb

# Query the server on port 5984 to check that it answers.
curl localhost:5984

# Python client library for CouchDB
pip install couchdb

##################################################
## Pig Latin
##################################################

# Download from http://pig.apache.org/releases.html

#-------------------------------------
#-- Install (assumes tar is in current directory)
#-------------------------------------

# Unpack the downloaded archive and delete it once extraction succeeded.
# The glob is restricted to the archive itself so that an already extracted
# pig-* directory is not passed to tar as an extra argument.
tar -xvf pig-*.tar* && rm pig-*.tar*

# -p: do not fail when /usr/local/apache/ already exists.
sudo mkdir -p /usr/local/apache/
sudo mv pig-* /usr/local/apache/pig

# Append the Pig environment to ~/.bashrc. The quoted 'EOF' delimiter keeps
# $PATH and $PIG_HOME unexpanded, so they are resolved when .bashrc is sourced.
# (Two leading blank lines separate the new section from existing content.)
cat >> ~/.bashrc <<'EOF'


# Apache Pig Latin
export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk-amd64/
export PIG_HOME=/usr/local/apache/pig
export PATH=$PATH:$PIG_HOME/bin
EOF

# Make the new variables available in the current shell.
source ~/.bashrc

##################################################
## Anaconda
##################################################

# Download from https://www.continuum.io/downloads
# & follow instructions

# Only when Anaconda was installed into a different folder (here
# /usr/local/anaconda2/): recursively grant read, write and execute on the
# whole tree to every user.
# NOTE(review): mode 777 makes the installation world-writable; presumably
# acceptable for a throwaway teaching VM only -- confirm before reusing elsewhere.
chmod -R 777 /usr/local/anaconda2/

##################################################
## Cesium JS
##################################################

#-------------------------------------
#-- Node
#-------------------------------------

# Run the NodeSource setup script for the 4.x line. The trailing argument must
# be a plain ASCII hyphen ("bash -" = read the script from stdin); a typographic
# dash would be taken as the name of a script file and the step would fail.
curl -sL https://deb.nodesource.com/setup_4.x | sudo -E bash -
sudo apt-get install -y nodejs npm

# Expose the binaries under /usr/local/bin as "node" and "npm".
sudo ln -s /usr/bin/nodejs /usr/local/bin/node
sudo ln -s /usr/bin/npm /usr/local/bin/npm
#-------------------------------------
#-- Cesium & Express
#-------------------------------------
# Installed into the current directory's node_modules (no -g flag).
npm install cesium
npm install express compression request yargs
#————————————-
#– Google Chrome
#————————————-

# Manual install: https://www.google.com/chrome/browser/desktop/
##################################################
## VM Customization
##################################################

#-------------------------------------
#-- Big Data Analytics (bda) sudo user
#-------------------------------------
# Create the "bda" account with a pre-hashed password and add it to the sudo
# and docker groups.
# The comment field needs straight quotes: with curly quotes the shell splits
# it into two words and useradd rejects the command line. Expansions are quoted
# so that a password or user name containing spaces or glob characters is
# passed through unchanged.
PASSWORD=bigdata
sudo useradd -c 'BD Analytics' -p "$(openssl passwd -1 "$PASSWORD")" bda
sudo usermod -aG sudo bda
sudo usermod -aG docker bda

# Clone the current user's home directory as the home of "bda" (copied to a
# temporary name first, then moved into place) and hand ownership to bda.
sudo cp -R "/home/$(whoami)" "/home/$(whoami).tmp"
sudo mv "/home/$(whoami).tmp" /home/bda
sudo chown bda:bda /home/bda -R

#-------------------------------------
#-- Enable autologin (bda user)
#-------------------------------------

# Start from a clean file; -f keeps this step from failing when no
# lightdm.conf exists yet.
sudo rm -f /etc/lightdm/lightdm.conf

# Write the autologin configuration in one go. The quoted here-document keeps
# the text literal, so no quote characters (straight or curly) end up in the
# config file.
sudo tee /etc/lightdm/lightdm.conf > /dev/null <<'EOF'
[SeatDefaults]
autologin-guest=false
autologin-user=bda
autologin-user-timeout=0
autologin-session=lightdm-autologin
EOF

]]>
197
Answers Pig http://espinosa-oviedo.com/big-data-visualization-2016-bsc/2016/05/03/answers-pig/?utm_source=rss&utm_medium=rss&utm_campaign=answers-pig Tue, 03 May 2016 18:59:05 +0000 http://espinosa-oviedo.com/big-data-visualization-2016-bsc/?p=148
  • Filter the speedtests conducted in Barcelona or Madrid. Then list the internet providers operating in those cities.
  • NeubotTests = LOAD 'NeubotTests' USING PigStorage(';') AS (
        client_address:  chararray,
        client_country:  chararray,
        lon:             float,
        lat:             float,
        client_provider: chararray,
        mlabservername:  chararray,
        connect_time:    float,
        download_speed:  float,
        neubot_version:  float,
        platform:        chararray,
        remote_address:  chararray,
        test_name:       chararray,
        timestamp:       long,
        upload_speed:    float,
        latency:         float,
        uuid:            chararray,
        asnum:           chararray,
        region:          chararray,
        city:            chararray,
        hour:            int,
        month:           int,
        year:            int,
        weekday:         int,
        day:             int,
        filedate:        chararray
    );

    --
    -- A1: Internet Providers in 'Barcelona' or 'Madrid' where speedtests were conducted
    --
    -- Keep only speedtest records whose city mentions Barcelona or Madrid, then
    -- project (city, provider) and drop the duplicate pairs.
    --

    SpeedTests = FILTER NeubotTests BY
        (test_name matches '.*speedtest.*') AND
        (city matches '.*Barcelona.*' OR city matches '.*Madrid.*');

    CityProviders = FOREACH SpeedTests GENERATE city, client_provider;
    UniqueCityProviders = DISTINCT CityProviders;

    DUMP UniqueCityProviders;
    • List the names and the IP ranges of the internet providers located in Barcelona. For this you need to use the IPtoNumber user defined function (cf. NeubotTestsUDFs.jar).
    NeubotTests = LOAD 'NeubotTests' USING PigStorage(';') AS (
        client_address:  chararray,
        client_country:  chararray,
        lon:             float,
        lat:             float,
        client_provider: chararray,
        mlabservername:  chararray,
        connect_time:    float,
        download_speed:  float,
        neubot_version:  float,
        platform:        chararray,
        remote_address:  chararray,
        test_name:       chararray,
        timestamp:       long,
        upload_speed:    float,
        latency:         float,
        uuid:            chararray,
        asnum:           chararray,
        region:          chararray,
        city:            chararray,
        hour:            int,
        month:           int,
        year:            int,
        weekday:         int,
        day:             int,
        filedate:        chararray
    );

    --
    -- A2: Internet Providers in Barcelona and their IP range based on the speedtests observations
    --
    -- Each client address is converted to a number so that MIN/MAX per provider
    -- give the lowest and highest observed address, which are converted back.
    -- NOTE(review): IPtoNumber / NumberToIP are not registered in this script;
    -- presumably NeubotTestsUDFs.jar is registered before it runs -- confirm.
    --

    SpeedTests = FILTER NeubotTests BY
        (test_name matches '.*speedtest.*') AND
        (city matches '.*Barcelona.*');

    ProviderAddresses = FOREACH SpeedTests GENERATE
        city,
        client_provider,
        IPtoNumber(client_address) AS ip
    ;

    AddressesByProvider = GROUP ProviderAddresses BY client_provider;

    Providers_IP_Range = FOREACH AddressesByProvider GENERATE
        group,
        NumberToIP( MIN(ProviderAddresses.ip) ),
        NumberToIP( MAX(ProviderAddresses.ip) )
    ;

    DUMP Providers_IP_Range;
    • Group the speedtests based on the user's network infrastructure (e.g., 3G/4G vs. ADSL). For this you can assume some maximum bandwidth (e.g., 21 Mb/sec for ADSL).
    NeubotTests = LOAD 'NeubotTests' USING PigStorage(';') AS (
        client_address:  chararray,
        client_country:  chararray,
        lon:             float,
        lat:             float,
        client_provider: chararray,
        mlabservername:  chararray,
        connect_time:    float,
        download_speed:  float,
        neubot_version:  float,
        platform:        chararray,
        remote_address:  chararray,
        test_name:       chararray,
        timestamp:       long,
        upload_speed:    float,
        latency:         float,
        uuid:            chararray,
        asnum:           chararray,
        region:          chararray,
        city:            chararray,
        hour:            int,
        month:           int,
        year:            int,
        weekday:         int,
        day:             int,
        filedate:        chararray
    );

    --
    -- A3: Speedtests (conducted in Barcelona) organized by network type: Mobile vs ADSL
    --
    -- Tests are split on a download-speed threshold of 21000000, labelled,
    -- merged again and counted per (rounded-up speed, network type) pair.
    --

    SpeedTests = FILTER NeubotTests BY
        (test_name matches '.*speedtest.*') AND
        (city matches '.*Barcelona.*');

    SPLIT SpeedTests INTO
        Mobile_Tests IF (download_speed >  21000000),  -- above 21 Mb / sec
        ADSL_Tests   IF (download_speed <= 21000000)   -- up to 21 Mb / sec
    ;

    -- Speeds are divided by 1000000 and rounded up before grouping.
    MobileSpeeds = FOREACH Mobile_Tests GENERATE
        CEIL(download_speed / 1000000) AS download_speed,
        'mobile' AS network_type: chararray
    ;

    ADSLSpeeds = FOREACH ADSL_Tests GENERATE
        CEIL(download_speed / 1000000) AS download_speed,
        'adsl' AS network_type: chararray
    ;

    Speeds = UNION MobileSpeeds, ADSLSpeeds;
    SpeedGroups = GROUP Speeds BY (download_speed, network_type);
    SpeedCounts = FOREACH SpeedGroups GENERATE
        CONCAT( (chararray) group.download_speed, ' mb/sec' ),
        group.network_type,
        COUNT(Speeds)
    ;

    DUMP SpeedCounts;
    • Find the user who performed the maximum number of tests. For this user, produce a table showing the evolution of her/his download/upload speeds.
    NeubotTests = LOAD 'NeubotTests' USING PigStorage(';') AS (
        client_address:  chararray,
        client_country:  chararray,
        lon:             float,
        lat:             float,
        client_provider: chararray,
        mlabservername:  chararray,
        connect_time:    float,
        download_speed:  float,
        neubot_version:  float,
        platform:        chararray,
        remote_address:  chararray,
        test_name:       chararray,
        timestamp:       long,
        upload_speed:    float,
        latency:         float,
        uuid:            chararray,
        asnum:           chararray,
        region:          chararray,
        city:            chararray,
        hour:            int,
        month:           int,
        year:            int,
        weekday:         int,
        day:             int,
        filedate:        chararray
    );

    --
    -- Determines the user who ran the largest number of speedtests and stores
    -- that user's download_speed log ordered by timestamp.
    --
    -- The test count per user is taken over the Barcelona speedtests only, while
    -- the stored log is drawn from all speedtests of the winning user(s).
    -- NOTE(review): the exercise also asks for upload speeds; only
    -- download_speed is projected here -- confirm whether that is intended.
    --

    Tests = FILTER NeubotTests BY (test_name matches '.*speedtest.*');

    Tests_In_Barcelona = FILTER Tests BY (city matches '.*Barcelona.*');

    -- Number of Barcelona speedtests per user (uuid).
    Tests_By_User = GROUP Tests_In_Barcelona BY uuid;
    Tests_Per_User = FOREACH Tests_By_User GENERATE
        group AS uuid,
        COUNT(Tests_In_Barcelona) AS numberOfTests
    ;

    -- Single-row relation holding the highest per-user count.
    All_Users = GROUP Tests_Per_User ALL;
    MAX_NUM_TESTS = FOREACH All_Users GENERATE
        MAX( Tests_Per_User.numberOfTests ) AS numberOfTests
    ;

    -- Users whose count equals the maximum (ties yield several users).
    Top_Counts = JOIN
        Tests_Per_User BY numberOfTests,
        MAX_NUM_TESTS  BY numberOfTests
    ;
    TOP_1_USER = FOREACH Top_Counts GENERATE Tests_Per_User::uuid AS uuid;

    -- All speedtests belonging to the top user(s).
    Top_User_Rows = JOIN
        Tests BY uuid,
        TOP_1_USER BY uuid
    ;
    TOP_1_USER_TESTS = FOREACH Top_User_Rows GENERATE
        Tests::uuid AS uuid,
        Tests::city AS city,
        Tests::timestamp AS timestamp,
        Tests::download_speed AS download_speed
    ;

    TOP_1_USER_TESTS = ORDER TOP_1_USER_TESTS BY timestamp;

    STORE TOP_1_USER_TESTS INTO 'Top_1_User' USING PigStorage(',');
    ]]>
    148
    Answers CouchDB http://espinosa-oviedo.com/big-data-visualization-2016-bsc/2016/05/03/answers-couchdb/?utm_source=rss&utm_medium=rss&utm_campaign=answers-couchdb Tue, 03 May 2016 18:50:52 +0000 http://espinosa-oviedo.com/big-data-visualization-2016-bsc/?p=144
  • Define a view in MapReduce that contains, for each theatre, the films presented in it. Hint: You do not need a reduce here.
  • function(doc) {
        if(doc.feed.theaterShowtimes[0]) {
            var movieTheater = doc.feed.theaterShowtimes[0].place.theater;
            var moviesOnShow = doc.feed.theaterShowtimes[0].movieShowtimes;
    
            var movies = [];
            for(var i=0; i < moviesOnShow.length; i++) {
                var movie = moviesOnShow[i].onShow.movie;
                movies.push(movie.title);
            } // for
    
            emit(movieTheater.name, movies);
        } // if
    } // func
    
    
    • Modify your previous answer to filter out the theaters outside Grenoble (e.g., do not include the theatres in Saint Martin d’Hères).
    function(doc) {
        if(doc.feed.theaterShowtimes[0]) {
            var movieTheater = doc.feed.theaterShowtimes[0].place.theater;
            var moviesOnShow = doc.feed.theaterShowtimes[0].movieShowtimes;
    
            var movies = [];
            if(movieTheater.city == "Grenoble") {
                 for(var i=0; i < moviesOnShow.length; i++) {
                     var movie = moviesOnShow[i].onShow.movie;
                     movies.push(movie.title); 
                 } // for
    
                 emit(movieTheater.name, movies);
    
             } // if
         } // if
    } // func
    • Give the number of films that each theatre is presenting. Hint: You need a reduce here.
    // Map
    function(doc) {
        if(doc.feed.theaterShowtimes[0]) {
            var movieTheater = doc.feed.theaterShowtimes[0].place.theater;
            var moviesOnShow = doc.feed.theaterShowtimes[0].movieShowtimes;
    
            for(var i=0; i < moviesOnShow.length; i++) {
                emit(movieTheater.name, 1);
            } // for
    
         } // if
    } // func
    
    
    // Reduce
    function (key, values) {
        return sum(values) ;
    }
    • Give the list of films with a press rating higher than 4 stars. Attention: filter duplicates.
    // Map
    function(doc) {
        if(doc.feed.theaterShowtimes[0]) {
            var movies = doc.feed.theaterShowtimes[0].movieShowtimes;
            for(var i=0; i < movies.length; i++) {
                var movie = movies[i].onShow.movie;
                if(movie.statistics.pressRating > 4) {
                    emit([movie.title, movie.statistics.pressRating], null);
                }
             } // for
        } // if
    } // func
    
     
    // Reduce
    function (keys,values) {
        return null ;
    }
    • Give the list of films presented 2 years ago (10.12.2011), and for each film, the theatre where it was presented and its schedule.
    function(doc) {
        if(doc.feed.theaterShowtimes[0]) {
            var movieTheater = doc.feed.theaterShowtimes[0].place.theater;
            var moviesOnShow = doc.feed.theaterShowtimes[0].movieShowtimes; 
    
            for(var i=0; i < moviesOnShow.length; i++) {
                var movie = {
                    "title":   moviesOnShow[i].onShow.movie.title,
                    "theater": movieTheater.name,
                    "date":    moviesOnShow[i].scr[0].d,
                    "schedule": []
                };
    
                if(movie.date == "2011-12-09") {
                    var showTime = moviesOnShow[i].scr[0].t;
                    for(var j=0; j < showTime.length; j++) {
                        movie.schedule.push( showTime[j].$ );
                    }
                    emit(movie.title, movie);
                } // if
            }  // for
        }  // if
    }  // func
    
    
    • BONUS! Give the list of films, and for every film, the list of theatres that present it (this question is a challenge but we encourage you to try to solve it).
    // Map
    function(doc) {
        if(doc.feed.theaterShowtimes[0]) {
            var movieTheater = doc.feed.theaterShowtimes[0].place.theater;
            var moviesOnShow = doc.feed.theaterShowtimes[0].movieShowtimes;
            for(var i=0; i < moviesOnShow.length; i++) {
                var movie = moviesOnShow[i].onShow.movie;
                emit(movie.title, movieTheater); 
            } // for
        } // if
    } // func
    
    
    // Reduce
    function(keys, values) {
        var movieTheaters = [] ; 
        for(var i=0; i<values.length; i++) {
            var theater = values[i].name;
            if(!contains(movieTheaters, theater)) {
                movieTheaters.push(theater);
            }
        } // for
        return [ movieTheaters.length, movieTheaters ];
    } // func
     
    /**
     * Tells whether `element` occurs in `array`.
     *
     * Entries are compared with loose equality (==), scanning from the first
     * index upwards and stopping at the first match.
     *
     * @param {Array} array - list to search
     * @param {*} element - value to look for
     * @returns {boolean} true when a matching entry exists, false otherwise
     */
    function contains(array, element) {
        for (var pos = 0; pos < array.length; pos += 1) {
            if (array[pos] == element) {
                return true;
            }
        }
        return false;
    }
    ]]>
    144