IN5800 – Saturation

Leif Harald Karlsen

Definition: Saturation

Saturation: A data transformation within a format and structure that only adds data derivable (according to some semantics) or otherwise computable from the source data

Sometimes also called derivation, however we have used this only with respect to reasoning and semantics.

Types of Saturations

Reasons for Saturation

# Find all ancestors of Bob living in Oslo.
SELECT ?ancestor_name
WHERE {
  ex:bob (ex-o:hasFather|ex-o:hasMother)+ ?ancestor .
  
  ?ancestor ex-o:hasName ?ancestor_name ;
            ex-o:address ?address .
            
  FILTER REGEX(?address, "^.*Oslo$")
}

# versus

SELECT ?ancestor_name
WHERE {
  ex:bob ex-o:hasAncestor [ ex-o:hasName ?ancestor_name ;
                            ex-o:addressCity "Oslo" ] .
}

Running Example: Data

Running Example: Problem

Saturation via Value Extraction

Running Example: Value Extraction

 16 - 2 kg boxes
 18 - 500 g pkgs.
 10 boxes x 12 pieces
 12 - 200 ml jars
 15 - 625 g jars
 24 pkgs. x 4 pieces
 16 kg pkg.
 10 boxes x 30 bags
 12 - 100 g bars
 20 - 450 g glasses
 24 - 500 ml bottles
 24 - 250 g pkgs.
 500 ml
 36 boxes
 ...
CREATE VIEW packaging(product_id, packaging) AS
WITH
  words AS ( -- Splits each packaging word into a separate row, with the product's id
    SELECT
      product_id,
      regexp_split_to_table(btrim(regexp_replace(quantity_per_unit, '(\d+)|(-)|(\sx\s)', ' ', 'g'), ' '), '\s+') AS word
    FROM products
  )
SELECT
  product_id,
  word AS packaging
FROM words
WHERE word NOT IN (SELECT unit FROM units) -- Drop units (these are contained in the utility table units)
      AND length(word) > 1;                -- Drop single letters

Saturation via Value Translation

Running Example: Value Translation

Need to put products into price categories (expensive, midrange, cheap) (make shipping_expensiveness):

CREATE VIEW expensiveness(product_id, expensiveness) AS
SELECT product_id,
       CASE WHEN unit_price > 50 THEN 'expensive'
            WHEN unit_price > 10 THEN 'midrange'
        ELSE 'cheap'
       END
FROM products;

Saturation via Computing Attributes/Relationships

Running Example: Computing Relationships

Need to know whether countries are neighboring (make shipping_spatial):

CREATE VIEW neighbors(country, neighbor) AS
SELECT c1.country_name, c2.country_name
FROM countries AS c1
     JOIN countries AS c2 ON (st_intersects(c1.geom, c2.geom)); -- Spatial intersection
    
CREATE VIEW not_neighbors(country1, country2) AS
SELECT c1.country_name, c2.country_name
FROM countries AS c1, countries AS c2
WHERE (c1.country_name, c2.country_name) NOT IN (SELECT country, neighbor FROM neighbors);

Also want to know from where to where each order is going and other attributes (simple denormalization):

CREATE VIEW shipment(order_id, product_id, from_country, to_country, expensiveness) AS
SELECT o.order_id,
       o.product_id,
       s.country AS from_country,
       c.country AS to_country,
       ex.expensiveness
FROM orders AS o
     JOIN customers AS c USING (customer_id)
     JOIN products AS p USING (product_id)
     JOIN suppliers AS s USING (supplier_id)
     JOIN expensiveness AS ex USING (product_id);

Saturation via Aggregation/Object Creation

CREATE VIEW street(name, num_houses, num_people) AS
WITH
  streets AS (
    SELECT DISTINCT btrim(regexp_replace(home_address, '\d+', '')) AS name
    FROM suburbia.home
  ),
  num_houses AS (
    SELECT s.name, count(home_address) AS num_houses
    FROM streets AS s, suburbia.home AS h
    WHERE h.home_address LIKE '%' || s.name || '%'
    GROUP BY s.name
  ),
  num_people AS (
    SELECT s.name, count(DISTINCT cid) AS num_people
    FROM streets AS s, suburbia.home AS h, suburbia.citizen AS c
    WHERE h.home_address LIKE '%' || s.name || '%' AND c.home = h.hid
    GROUP BY s.name
  )
SELECT s.name, nh.num_houses, np.num_people
FROM streets AS s JOIN num_houses AS nh USING (name)
     JOIN num_people AS np USING (name);

Running Example: Aggregation/Object Creation

Need to know in which countries we have warehouses (make shipping_warehouses):

CREATE VIEW warehouse(warehouse_id, country) AS
SELECT row_number() OVER (ORDER BY country) AS warehouse_id,
       country
FROM (
    SELECT country
    FROM customers
    GROUP BY country
    HAVING count(*) > 4  -- More than 4 customers
    UNION
    SELECT country_name
    FROM countries
    WHERE st_area(geom) > 50  -- Large area
) AS t;

Saturation via Semantics/Rules

Running Example: Semantics/Rules

Lets add “same packaging” and finally, which orders can be shipped with which, as Lore-rules (make shipping_rules):

CREATE RELATION package_type(product_id smallint, ptype text);

package_type(pid, 'boxed') <- packaging(pid, pkg)
    : pkg IN ('boxes', 'box', 'pkg', 'pkgs', 'pkgs.', 'bars', 'rounds', 'pies');

package_type(pid, 'bottled') <- packaging(pid, pkg)
    : pkg IN ('glasses', 'bottles', 'bottle', 'jars');

package_type(pid, 'canned') <- packaging(pid, pkg)
    : pkg IN ('tins', 'cans');
CREATE RELATION ship_with(oid1 int, oid2 int);

ship_with(oid1, oid2) <-
    shipment(oid1, pid1, ship_from, ship_to, ex),     -- Shipping from and to the same
    shipment(oid2, pid2, ship_from, ship_to, ex),     -- and same expensiveness
    package_type(pid1, pt), package_type(pid2, pt),   -- and same packaging,
    warehouse(w1, ship_from), warehouse(w2, ship_to), -- from and to has warehouses,
    not_neighbors(ship_to, ship_from);                -- from and to are not neighbors
-- Finally make view with all groups of orders that can be shipped together
-- Note: ship_with-relation is reflexive, symmetric, transitive, i.e. an equivalence-relation
-- (thus partitions the orders into equivalence classes that can be shipped together)
CREATE VIEW shipping_group(sgroup) AS
SELECT *
FROM ( -- Aggregate orders that can be shipped together into arrays
  SELECT DISTINCT array_agg(DISTINCT oid2 ORDER BY oid2) AS sgroup
  FROM ship_with
  GROUP BY oid1 -- oid1 will also be in the array, as ship_with is reflexive
) AS t
WHERE array_length(sgroup, 1) > 1 -- Drop single-element groups
ORDER BY array_length(sgroup, 1) DESC;

Running Example: Semantics/OWL

# Introduce each packaging type as a class
ex:Cheap a owl:Class ; rdfs:subClassOf ex:Product .
ex:Midrange a owl:Class ; rdfs:subClassOf ex:Product .
ex:Expensive a owl:Class ; rdfs:subClassOf ex:Product .

# State that every product is exactly one of these types
[] owl:allDisjoint ( ex:Cheap ex:Midrange ex:Expensive ) .
ex:Product rdfs:subClassOf ex:Cheap or ex:Midrange or ex:Expensive .

# Define the classes in terms of the price
ex:price some (integer < 10) rdfs:subClassOf ex:Cheap .
ex:price some (integer >= 10 and integer < 50) rdfs:subClassOf ex:Midrange .
ex:price some (integer >= 50) rdfs:subClassOf ex:Expensive .


# Introduce each packaging type as a class
ex:Boxed a owl:Class ; rdfs:subClassOf ex:Order .
ex:Bottled a owl:Class ; rdfs:subClassOf ex:Order .
ex:Canned a owl:Class ; rdfs:subClassOf ex:Order .

# State that every order is exactly one of these types
[] owl:allDisjoint ( ex:Boxed ex:Bottled ex:Canned ) .
ex:Product rdfs:subClassOf ex:Bottled or ex:Bottled or ex:Canned .

# Define the classes in terms of the packaging word
ex:packaging some {"boxes", "box", "pkg.", "pkgs."} rdfs:subClassOf ex:Boxed .
ex:packaging some {"glasses", "bottles", "bottle", "jars"} rdfs:subClassOf ex:Bottled .
ex:packaging some {"tins", "cans"} rdfs:subClassOf ex:Canned .

Saturation via Interpolation/Extrapolation

-- Given hourly temperature forecasts:
CREATE TABLE temp_1h(lat float, lon float, time timestamp, temp float);
INSERT INTO temp_1h VALUES
(59.1, 10.2, '2022-03-22 12:00:00', 8.0),
(59.1, 10.2, '2022-03-22 13:00:00', 10.3),
(59.1, 10.2, '2022-03-22 14:00:00', 11.1),
(59.1, 10.2, '2022-03-22 15:00:00', 12.5),
(60.2, 10.5, '2022-03-22 12:00:00', 2.1),
(60.2, 10.5, '2022-03-22 13:00:00', 5.2),
(60.2, 10.5, '2022-03-22 14:00:00', 6.1),
(60.2, 10.5, '2022-03-22 15:00:00', 8.3);

CREATE VIEW temp_30m(lat, lon, time, temp) AS
WITH
  temps(lat, lon, time, temp) AS (
    SELECT t1.lat, t1.lon, t1.time + '30 min'::interval, (t1.temp + t2.temp)/2
    FROM temp_1h AS t1 JOIN temp_1h AS t2
         ON (t1.lat = t2.lat AND t1.lon = t2.lon AND t2.time = (t1.time + '1 hour'::interval))
    UNION ALL
    SELECT *
    FROM temp_1h
  )
SELECT * FROM temps
ORDER BY lat, lon, time;

Bad Saturation