- Filter the speedtest conducted in Barcelona or Madrid. Then list the internet providers working in those cities.
NeubotTests = LOAD 'NeubotTests' using PigStorage(';') as (
client_address: chararray,
client_country: chararray,
lon: float,
lat: float,
client_provider: chararray,
mlabservername: chararray,
connect_time: float,
download_speed: float,
neubot_version: float,
platform: chararray,
remote_address: chararray,
test_name: chararray,
timestamp: long,
upload_speed: float,
latency: float,
uuid: chararray,
asnum: chararray,
region: chararray,
city: chararray,
hour: int,
month: int,
year: int,
weekday: int,
day: int,
filedate: chararray
);
--
-- A1: Internet Providers in 'Barcelona' or 'Madrid' where speedtests were conducted
--
SpeedTests = FILTER @ BY (test_name matches '.*speedtest.*');
SpeedTests = FILTER @ BY (
city matches '.*Barcelona.*' OR
city matches '.*Madrid.*'
);
Providers = FOREACH @ GENERATE city, client_provider;
Providers = DISTINCT @;
DUMP @;
- List the names and the IP ranges of the internet providers located in Barcelona. For this you need to use the IPtoNumber user defined function (cf. NeubotTestsUDFs.jar).
NeubotTests = LOAD 'NeubotTests' using PigStorage(';') as (
client_address: chararray,
client_country: chararray,
lon: float,
lat: float,
client_provider: chararray,
mlabservername: chararray,
connect_time: float,
download_speed: float,
neubot_version: float,
platform: chararray,
remote_address: chararray,
test_name: chararray,
timestamp: long,
upload_speed: float,
latency: float,
uuid: chararray,
asnum: chararray,
region: chararray,
city: chararray,
hour: int,
month: int,
year: int,
weekday: int,
day: int,
filedate: chararray
);
--
-- A2: Internet Providers in Barcelona and their IP range based on the speedtests observations
--
SpeedTests = FILTER @ BY (test_name matches '.*speedtest.*');
SpeedTests = FILTER @ BY (
city matches '.*Barcelona.*'
);
Providers = FOREACH @ GENERATE
city,
client_provider,
IPtoNumber(client_address) AS ip
;
Providers = GROUP @ BY client_provider;
Providers_IP_Range = FOREACH @ GENERATE
group,
NumberToIP( MIN(Providers.ip) ),
NumberToIP( MAX(Providers.ip) )
;
DUMP @;
- Group the speedtest based on the user network infrastructure (e.g., 3G/4G vs ADSL). For this you can assume some max bandwidth (e.g., 21Mb/sec for ADSL).
NeubotTests = LOAD 'NeubotTests' using PigStorage(';') as (
client_address: chararray,
client_country: chararray,
lon: float,
lat: float,
client_provider: chararray,
mlabservername: chararray,
connect_time: float,
download_speed: float,
neubot_version: float,
platform: chararray,
remote_address: chararray,
test_name: chararray,
timestamp: long,
upload_speed: float,
latency: float,
uuid: chararray,
asnum: chararray,
region: chararray,
city: chararray,
hour: int,
month: int,
year: int,
weekday: int,
day: int,
filedate: chararray
);
--
-- A3: Speedtests (conducted in Barcelona) organized by network type: Mobile vs ADSL
--
SpeedTests = FILTER @ BY (test_name matches '.*speedtest.*');
SpeedTests = FILTER @ BY (
city matches '.*Barcelona.*'
);
SPLIT @ INTO
Mobile_Tests IF (
download_speed > 21000000 -- 21 Mb / sec
),
ADSL_Tests IF (
download_speed <= 21000000 -- 21 Mb / sec
)
;
MobileSpeeds = FOREACH Mobile_Tests GENERATE
CEIL(download_speed / 1000000) AS download_speed,
'mobile' AS network_type: chararray
;
ADSLSpeeds = FOREACH ADSL_Tests GENERATE
CEIL(download_speed / 1000000) AS download_speed,
'adsl' AS network_type: chararray
;
Speeds = UNION MobileSpeeds, ADSLSpeeds;
Speeds = GROUP @ BY (download_speed, network_type);
Speeds = FOREACH @ GENERATE
CONCAT( (chararray) group.download_speed, ' mb/sec' ),
group.network_type,
COUNT(Speeds)
;
DUMP @;
- Find the user that realized the maximum number of tests. For this user, produce a table showing the evolution of her/his download/upload speeds.
NeubotTests = LOAD 'NeubotTests' using PigStorage(';') as (
client_address: chararray,
client_country: chararray,
lon: float,
lat: float,
client_provider: chararray,
mlabservername: chararray,
connect_time: float,
download_speed: float,
neubot_version: float,
platform: chararray,
remote_address: chararray,
test_name: chararray,
timestamp: long,
upload_speed: float,
latency: float,
uuid: chararray,
asnum: chararray,
region: chararray,
city: chararray,
hour: int,
month: int,
year: int,
weekday: int,
day: int,
filedate: chararray
);
--
-- Determines the user that realized the maximum number of tests and
-- obtain his/her download_speed log
--
Tests = FILTER @ BY (test_name matches '.*speedtest.*');
Tests_In_Barcelona = FILTER @ BY (
city matches '.*Barcelona.*'
);
Tests_Per_User = GROUP Tests_In_Barcelona BY uuid;
Tests_Per_User = FOREACH @ GENERATE
group AS uuid,
COUNT(Tests_In_Barcelona) AS numberOfTests
;
MAX_NUM_TESTS = GROUP @ ALL;
MAX_NUM_TESTS = FOREACH @ GENERATE
MAX( Tests_Per_User.numberOfTests) AS numberOfTests
;
TOP_1_USER = JOIN
Tests_Per_User BY numberOfTests,
MAX_NUM_TESTS BY numberOfTests
;
TOP_1_USER = FOREACH @ GENERATE Tests_Per_User::uuid AS uuid;
TOP_1_USER_TESTS = JOIN
Tests BY uuid,
TOP_1_USER BY uuid
;
TOP_1_USER_TESTS = FOREACH @ GENERATE
Tests::uuid AS uuid,
Tests::city AS city,
Tests::timestamp AS timestamp,
Tests::download_speed AS download_speed
;
TOP_1_USER_TESTS = ORDER @ BY timestamp;
STORE @ INTO 'Top_1_User' USING PigStorage(',');