Athena

Amazon Athena 筆記。

Intro

  • Athena
    • Is an interactive query service that makes it easy to analyze data in Amazon S3 using standard SQL
    • Is serverless, so there’s no infra to manage, and pay only for the queries that you run
    • Uses Presto to run SQL Queries
    • Is out-of-the-box integrated with AWS Glue Data Catalog

Presto

  • Developed by Facebook, Not Open Source
  • Presto is a distributed system that runs on Hadoop, and uses an architecture similar to a classic massively parallel processing (MPP) database management system
  • It has one coordinator node working in sync with multiple worker nodes

Glue Data Catalog

Meta Data includes:

  • Data Location
  • Schema
  • Data Types
  • Data Classification

Persistent Metadata Store

  • It’s a managed service that lets you store, annotate, and share metadata which can be used to query and transform data
  • One AWS Glue Data Catalog per AWS Region
  • Can be used for data governance

Athena SQL Query

Create Database

create database demo_data;

Create Table

CREATE EXTERNAL TABLE IF NOT EXISTS customers
(
customerid bigint,
firstname string,
lastname string,
fullname string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION 's3://your-bucket/data/customers/';
CREATE EXTERNAL TABLE IF NOT EXISTS employees(
employeeid  bigint,
managerid bigint,
firstname string,
lastname string ,
fullname  string,
jobtitle string,
organizationlevel int,
maritalstatus string,
gender string,
territory string,
country string,
group string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION 's3://your-bucket/data/employees/';
CREATE EXTERNAL TABLE IF NOT EXISTS orders(
salesorderid bigint,
salesorderdetailid int,
orderdate string,
duedate string,
shipdate string,
employeeid bigint,
customerid bigint,
subtotal decimal(17,4),
taxamt decimal(17,4),
freight decimal(17,4),
totaldue decimal(17,4),
productid  int,
orderqty int,
unitprice decimal(17,4),
unitpricediscount decimal(17,4),
linetotal decimal(17,4)
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION 's3://your-bucket/data/orders/';

Query Data

SELECT * FROM customers;
SELECT firstname, lastname from customers limit 5;

Concat

SELECT concat(firstname,' ',lastname) as full_name from customers limit 5;

WHERE

SELECT * FROM customers WHERE firstname = 'John' limit 5;

OR / AND

SELECT * FROM customers WHERE firstname = 'John' AND lastname = 'Arthur' limit 5;
SELECT * FROM customers WHERE firstname = 'John' OR lastname = 'Arthur' limit 5;

IN

SELECT * FROM customers WHERE customerid in (371, 377);

WILD CARDS

SELECT * FROM customers WHERE fullname like 'J%' limit 5;

UNION

SELECT * FROM customers WHERE customerid = 371
UNION
SELECT * FROM customers WHERE customerid in (371, 377);

INSERT DATA

INSERT INTO customers (customerid, firstname, lastname, fullname)
values (1221, 'Lex', 'Luthor', 'Lex Luthor');

DISTINCT

SELECT DISTINCT firstname FROM customers WHERE fullname like 'J%';

COUNT

SELECT COUNT(distinct firstname) FROM customers WHERE firstname like 'J%';

GROUP BY

SELECT firstname, COUNT(firstname) as countJFirstnames
FROM customers
WHERE firstname like 'J%'
GROUP BY firstname;

Nested Query

SELECT * FROM customers
WHERE customerid IN (SELECT customerid FROM customers);

Common Table Expression

WITH cte as
(
SELECT firstname as first_name, lastname as last_name
FROM customers
)
SELECT *
FROM cte;