Merge branch 'feat/clickhouse-mv' into dev

This commit is contained in:
Francis Cao 2024-07-22 22:13:56 -07:00
commit 9957080f36
11 changed files with 302 additions and 66 deletions

View File

@ -67,3 +67,202 @@ CREATE TABLE umami.session_data
engine = MergeTree engine = MergeTree
ORDER BY (website_id, session_id, data_key, created_at) ORDER BY (website_id, session_id, data_key, created_at)
SETTINGS index_granularity = 8192; SETTINGS index_granularity = 8192;
-- stats hourly
CREATE TABLE umami.website_event_stats_hourly
(
website_id UUID,
session_id UUID,
visit_id UUID,
hostname LowCardinality(String),
browser LowCardinality(String),
os LowCardinality(String),
device LowCardinality(String),
country LowCardinality(String),
subdivision1 LowCardinality(String),
city String,
entry_url AggregateFunction(argMin, String, DateTime('UTC')),
exit_url AggregateFunction(argMax, String, DateTime('UTC')),
url_path SimpleAggregateFunction(groupArrayArray, Array(String)),
url_query SimpleAggregateFunction(groupArrayArray, Array(String)),
referrer_domain SimpleAggregateFunction(groupArrayArray, Array(String)),
page_title SimpleAggregateFunction(groupArrayArray, Array(String)),
event_type UInt32,
event_name SimpleAggregateFunction(groupArrayArray, Array(String)),
views SimpleAggregateFunction(sum, UInt64),
min_time SimpleAggregateFunction(min, DateTime('UTC')),
max_time SimpleAggregateFunction(max, DateTime('UTC')),
created_at Datetime('UTC')
)
ENGINE = AggregatingMergeTree
PARTITION BY toYYYYMM(created_at)
ORDER BY (
website_id,
toStartOfHour(created_at),
cityHash64(visit_id),
visit_id
)
SAMPLE BY cityHash64(visit_id)
TTL created_at + INTERVAL 10 DAY;
CREATE MATERIALIZED VIEW umami.website_event_stats_hourly_mv
TO umami.website_event_stats_hourly
AS
SELECT
website_id,
session_id,
visit_id,
hostname,
browser,
os,
device,
country,
subdivision1,
city,
entry_url,
exit_url,
url_paths as url_path,
url_query,
referrer_domain,
page_title,
event_type,
event_name,
views,
min_time,
max_time,
timestamp as created_at
FROM (SELECT
website_id,
session_id,
visit_id,
hostname,
browser,
os,
device,
country,
subdivision1,
city,
argMinState(url_path, created_at) entry_url,
argMaxState(url_path, created_at) exit_url,
arrayFilter(x -> x != '', groupArray(url_path)) as url_paths,
arrayFilter(x -> x != '', groupArray(url_query)) url_query,
arrayFilter(x -> x != '', groupArray(referrer_domain)) referrer_domain,
arrayFilter(x -> x != '', groupArray(page_title)) page_title,
event_type,
if(event_type = 2, groupArray(event_name), []) event_name,
sumIf(1, event_type = 1) views,
min(created_at) min_time,
max(created_at) max_time,
toStartOfHour(created_at) timestamp
FROM umami.website_event
GROUP BY website_id,
session_id,
visit_id,
hostname,
browser,
os,
device,
country,
subdivision1,
city,
event_type,
timestamp);
-- stats daily
CREATE TABLE umami.website_event_stats_daily
(
website_id UUID,
session_id UUID,
visit_id UUID,
hostname LowCardinality(String),
browser LowCardinality(String),
os LowCardinality(String),
device LowCardinality(String),
country LowCardinality(String),
subdivision1 LowCardinality(String),
city String,
entry_url AggregateFunction(argMin, String, DateTime('UTC')),
exit_url AggregateFunction(argMax, String, DateTime('UTC')),
url_path SimpleAggregateFunction(groupArrayArray, Array(String)),
url_query SimpleAggregateFunction(groupArrayArray, Array(String)),
referrer_domain SimpleAggregateFunction(groupArrayArray, Array(String)),
page_title SimpleAggregateFunction(groupArrayArray, Array(String)),
event_type UInt32,
event_name SimpleAggregateFunction(groupArrayArray, Array(String)),
views SimpleAggregateFunction(sum, UInt64),
min_time SimpleAggregateFunction(min, DateTime('UTC')),
max_time SimpleAggregateFunction(max, DateTime('UTC')),
created_at Datetime('UTC')
)
ENGINE = AggregatingMergeTree
PARTITION BY toYYYYMM(created_at)
ORDER BY (
website_id,
toStartOfDay(created_at),
cityHash64(visit_id),
visit_id
)
SAMPLE BY cityHash64(visit_id);
CREATE MATERIALIZED VIEW umami.website_event_stats_daily_mv
TO umami.website_event_stats_daily
AS
SELECT
website_id,
session_id,
visit_id,
hostname,
browser,
os,
device,
country,
subdivision1,
city,
entry_url,
exit_url,
url_paths as url_path,
url_query,
referrer_domain,
page_title,
event_type,
event_name,
views,
min_time,
max_time,
timestamp as created_at
FROM (SELECT
website_id,
session_id,
visit_id,
hostname,
browser,
os,
device,
country,
subdivision1,
city,
argMinState(url_path, created_at) entry_url,
argMaxState(url_path, created_at) exit_url,
arrayFilter(x -> x != '', groupArray(url_path)) as url_paths,
arrayFilter(x -> x != '', groupArray(url_query)) url_query,
arrayFilter(x -> x != '', groupArray(referrer_domain)) referrer_domain,
arrayFilter(x -> x != '', groupArray(page_title)) page_title,
event_type,
if(event_type = 2, groupArray(event_name), []) event_name,
sumIf(1, event_type = 1) views,
min(created_at) min_time,
max(created_at) max_time,
toStartOfDay(created_at) timestamp
FROM umami.website_event
GROUP BY website_id,
session_id,
visit_id,
hostname,
browser,
os,
device,
country,
subdivision1,
city,
event_type,
timestamp);

View File

@ -2,6 +2,6 @@ require('dotenv').config();
const cli = require('next/dist/cli/next-start'); const cli = require('next/dist/cli/next-start');
cli.nextStart({ cli.nextStart({
'port': process.env.PORT || 3000, port: process.env.PORT || 3000,
'hostname': process.env.HOSTNAME || '0.0.0.0' hostname: process.env.HOSTNAME || '0.0.0.0',
}); });

View File

@ -58,8 +58,8 @@ export const SESSION_COLUMNS = [
export const FILTER_COLUMNS = { export const FILTER_COLUMNS = {
url: 'url_path', url: 'url_path',
entry: 'url_path', entry: 'entry_url',
exit: 'url_path', exit: 'exit_url',
referrer: 'referrer_domain', referrer: 'referrer_domain',
host: 'hostname', host: 'hostname',
title: 'page_title', title: 'page_title',

View File

@ -64,7 +64,7 @@ export default async (
await useAuth(req, res); await useAuth(req, res);
await useValidate(schema, req, res); await useValidate(schema, req, res);
const { websiteId, type, limit, offset, search } = req.query; const { websiteId, type, limit, offset, search, unit } = req.query;
if (req.method === 'GET') { if (req.method === 'GET') {
if (!(await canViewWebsite(req.auth, websiteId))) { if (!(await canViewWebsite(req.auth, websiteId))) {
@ -89,7 +89,7 @@ export default async (
} }
if (SESSION_COLUMNS.includes(type)) { if (SESSION_COLUMNS.includes(type)) {
const data = await getSessionMetrics(websiteId, type, filters, limit, offset); const data = await getSessionMetrics(websiteId, type, filters, limit, offset, unit as string);
if (type === 'language') { if (type === 'language') {
const combined = {}; const combined = {};
@ -111,7 +111,14 @@ export default async (
} }
if (EVENT_COLUMNS.includes(type)) { if (EVENT_COLUMNS.includes(type)) {
const data = await getPageviewMetrics(websiteId, type, filters, limit, offset); const data = await getPageviewMetrics(
websiteId,
type,
filters,
limit,
offset,
unit as string,
);
return ok(res, data); return ok(res, data);
} }

View File

@ -56,7 +56,7 @@ export default async (
await useAuth(req, res); await useAuth(req, res);
await useValidate(schema, req, res); await useValidate(schema, req, res);
const { websiteId, compare } = req.query; const { websiteId, compare, unit } = req.query;
if (req.method === 'GET') { if (req.method === 'GET') {
if (!(await canViewWebsite(req.auth, websiteId))) { if (!(await canViewWebsite(req.auth, websiteId))) {
@ -72,9 +72,13 @@ export default async (
const filters = getRequestFilters(req); const filters = getRequestFilters(req);
const metrics = await getWebsiteStats(websiteId, { ...filters, startDate, endDate }); const metrics = await getWebsiteStats(websiteId, unit as string, {
...filters,
startDate,
endDate,
});
const prevPeriod = await getWebsiteStats(websiteId, { const prevPeriod = await getWebsiteStats(websiteId, unit as string, {
...filters, ...filters,
startDate: compareStartDate, startDate: compareStartDate,
endDate: compareEndDate, endDate: compareEndDate,

View File

@ -51,17 +51,23 @@ async function clickhouseQuery(
eventType: EVENT_TYPE.customEvent, eventType: EVENT_TYPE.customEvent,
}); });
const table = unit === 'hour' ? 'website_event_stats_hourly' : 'website_event_stats_daily';
return rawQuery( return rawQuery(
` `
select select
event_name x, event_name x,
${getDateSQL('created_at', unit, timezone)} t, ${getDateSQL('created_at', unit, timezone)} t,
count(*) y count(*) y
from website_event from (
select arrayJoin(event_name) as event_name,
created_at
from ${table} website_event
where website_id = {websiteId:UUID} where website_id = {websiteId:UUID}
and created_at between {startDate:DateTime64} and {endDate:DateTime64} and created_at between {startDate:DateTime64} and {endDate:DateTime64}
and event_type = {eventType:UInt32} and event_type = {eventType:UInt32}
${filterQuery} ${filterQuery}
) as g
group by x, t group by x, t
order by t order by t
`, `,

View File

@ -1,3 +1,4 @@
/* eslint-disable no-unused-vars, @typescript-eslint/no-unused-vars */
import clickhouse from 'lib/clickhouse'; import clickhouse from 'lib/clickhouse';
import { EVENT_TYPE } from 'lib/constants'; import { EVENT_TYPE } from 'lib/constants';
import { CLICKHOUSE, PRISMA, runQuery } from 'lib/db'; import { CLICKHOUSE, PRISMA, runQuery } from 'lib/db';
@ -5,7 +6,7 @@ import prisma from 'lib/prisma';
import { QueryFilters } from 'lib/types'; import { QueryFilters } from 'lib/types';
export async function getWebsiteStats( export async function getWebsiteStats(
...args: [websiteId: string, filters: QueryFilters] ...args: [websiteId: string, unit: string, filters: QueryFilters]
): Promise< ): Promise<
{ pageviews: number; visitors: number; visits: number; bounces: number; totaltime: number }[] { pageviews: number; visitors: number; visits: number; bounces: number; totaltime: number }[]
> { > {
@ -17,6 +18,7 @@ export async function getWebsiteStats(
async function relationalQuery( async function relationalQuery(
websiteId: string, websiteId: string,
unit: string,
filters: QueryFilters, filters: QueryFilters,
): Promise< ): Promise<
{ pageviews: number; visitors: number; visits: number; bounces: number; totaltime: number }[] { pageviews: number; visitors: number; visits: number; bounces: number; totaltime: number }[]
@ -57,6 +59,7 @@ async function relationalQuery(
async function clickhouseQuery( async function clickhouseQuery(
websiteId: string, websiteId: string,
unit: string,
filters: QueryFilters, filters: QueryFilters,
): Promise< ): Promise<
{ pageviews: number; visitors: number; visits: number; bounces: number; totaltime: number }[] { pageviews: number; visitors: number; visits: number; bounces: number; totaltime: number }[]
@ -66,29 +69,21 @@ async function clickhouseQuery(
...filters, ...filters,
eventType: EVENT_TYPE.pageView, eventType: EVENT_TYPE.pageView,
}); });
const table = unit === 'hour' ? 'website_event_stats_hourly' : 'website_event_stats_daily';
return rawQuery( return rawQuery(
` `
select select
sum(t.c) as "pageviews", sum(views) as "pageviews",
uniq(t.session_id) as "visitors", uniq(session_id) as "visitors",
uniq(t.visit_id) as "visits", uniq(visit_id) as "visits",
sum(if(t.c = 1, 1, 0)) as "bounces", sumIf(1, views = 1) as "bounces",
sum(max_time-min_time) as "totaltime" sum(max_time-min_time) as "totaltime"
from ( from ${table} "website_event"
select
session_id,
visit_id,
count(*) c,
min(created_at) min_time,
max(created_at) max_time
from website_event
where website_id = {websiteId:UUID} where website_id = {websiteId:UUID}
and created_at between {startDate:DateTime64} and {endDate:DateTime64} and created_at between {startDate:DateTime64} and {endDate:DateTime64}
and event_type = {eventType:UInt32} and event_type = {eventType:UInt32}
${filterQuery} ${filterQuery};
group by session_id, visit_id
) as t;
`, `,
params, params,
).then(result => { ).then(result => {

View File

@ -1,3 +1,4 @@
/* eslint-disable no-unused-vars, @typescript-eslint/no-unused-vars */
import clickhouse from 'lib/clickhouse'; import clickhouse from 'lib/clickhouse';
import { EVENT_TYPE, FILTER_COLUMNS, SESSION_COLUMNS } from 'lib/constants'; import { EVENT_TYPE, FILTER_COLUMNS, SESSION_COLUMNS } from 'lib/constants';
import { CLICKHOUSE, PRISMA, runQuery } from 'lib/db'; import { CLICKHOUSE, PRISMA, runQuery } from 'lib/db';
@ -5,7 +6,14 @@ import prisma from 'lib/prisma';
import { QueryFilters } from 'lib/types'; import { QueryFilters } from 'lib/types';
export async function getPageviewMetrics( export async function getPageviewMetrics(
...args: [websiteId: string, type: string, filters: QueryFilters, limit?: number, offset?: number] ...args: [
websiteId: string,
type: string,
filters: QueryFilters,
limit?: number,
offset?: number,
unit?: string,
]
) { ) {
return runQuery({ return runQuery({
[PRISMA]: () => relationalQuery(...args), [PRISMA]: () => relationalQuery(...args),
@ -19,6 +27,7 @@ async function relationalQuery(
filters: QueryFilters, filters: QueryFilters,
limit: number = 500, limit: number = 500,
offset: number = 0, offset: number = 0,
unit: string,
) { ) {
const column = FILTER_COLUMNS[type] || type; const column = FILTER_COLUMNS[type] || type;
const { rawQuery, parseFilters } = prisma; const { rawQuery, parseFilters } = prisma;
@ -82,6 +91,7 @@ async function clickhouseQuery(
filters: QueryFilters, filters: QueryFilters,
limit: number = 500, limit: number = 500,
offset: number = 0, offset: number = 0,
unit: string,
): Promise<{ x: string; y: number }[]> { ): Promise<{ x: string; y: number }[]> {
const column = FILTER_COLUMNS[type] || type; const column = FILTER_COLUMNS[type] || type;
const { rawQuery, parseFilters } = clickhouse; const { rawQuery, parseFilters } = clickhouse;
@ -90,40 +100,42 @@ async function clickhouseQuery(
eventType: column === 'event_name' ? EVENT_TYPE.customEvent : EVENT_TYPE.pageView, eventType: column === 'event_name' ? EVENT_TYPE.customEvent : EVENT_TYPE.pageView,
}); });
let entryExitQuery = '';
let excludeDomain = ''; let excludeDomain = '';
let groupByQuery = '';
if (column === 'referrer_domain') { if (column === 'referrer_domain') {
excludeDomain = `and referrer_domain != {websiteDomain:String} and referrer_domain != ''`; excludeDomain = `and t != {websiteDomain:String} and t != ''`;
}
let columnQuery = `arrayJoin(${column})`;
if (type === 'entry') {
columnQuery = `visit_id x, argMinMerge(${column})`;
}
if (type === 'exit') {
columnQuery = `visit_id x, argMaxMerge(${column})`;
} }
if (type === 'entry' || type === 'exit') { if (type === 'entry' || type === 'exit') {
const aggregrate = type === 'entry' ? 'min' : 'max'; groupByQuery = 'group by x';
entryExitQuery = `
join (
select visit_id,
${aggregrate}(created_at) target_created_at
from website_event
where website_id = {websiteId:UUID}
and created_at between {startDate:DateTime64} and {endDate:DateTime64}
and event_type = {eventType:UInt32}
group by visit_id
) x
on x.visit_id = website_event.visit_id
and x.target_created_at = website_event.created_at
`;
} }
const table = unit === 'hour' ? 'website_event_stats_hourly' : 'website_event_stats_daily';
return rawQuery( return rawQuery(
` `
select ${column} x, count(*) y select g.t as x,
from website_event count(*) as y
${entryExitQuery} from (
select ${columnQuery} as t
from ${table} website_event
where website_id = {websiteId:UUID} where website_id = {websiteId:UUID}
and created_at between {startDate:DateTime64} and {endDate:DateTime64} and created_at between {startDate:DateTime64} and {endDate:DateTime64}
and event_type = {eventType:UInt32} and event_type = {eventType:UInt32}
${excludeDomain} ${excludeDomain}
${filterQuery} ${filterQuery}
${groupByQuery}) as g
group by x group by x
order by y desc order by y desc
limit ${limit} limit ${limit}

View File

@ -46,6 +46,7 @@ async function clickhouseQuery(
...filters, ...filters,
eventType: EVENT_TYPE.pageView, eventType: EVENT_TYPE.pageView,
}); });
const table = unit === 'hour' ? 'website_event_stats_hourly' : 'website_event_stats_daily';
return rawQuery( return rawQuery(
` `
@ -55,8 +56,8 @@ async function clickhouseQuery(
from ( from (
select select
${getDateSQL('created_at', unit, timezone)} as t, ${getDateSQL('created_at', unit, timezone)} as t,
count(*) as y sum(views) as y
from website_event from ${table} website_event
where website_id = {websiteId:UUID} where website_id = {websiteId:UUID}
and created_at between {startDate:DateTime64} and {endDate:DateTime64} and created_at between {startDate:DateTime64} and {endDate:DateTime64}
and event_type = {eventType:UInt32} and event_type = {eventType:UInt32}

View File

@ -1,3 +1,4 @@
/* eslint-disable no-unused-vars, @typescript-eslint/no-unused-vars */
import prisma from 'lib/prisma'; import prisma from 'lib/prisma';
import clickhouse from 'lib/clickhouse'; import clickhouse from 'lib/clickhouse';
import { runQuery, CLICKHOUSE, PRISMA } from 'lib/db'; import { runQuery, CLICKHOUSE, PRISMA } from 'lib/db';
@ -5,7 +6,14 @@ import { EVENT_TYPE, FILTER_COLUMNS, SESSION_COLUMNS } from 'lib/constants';
import { QueryFilters } from 'lib/types'; import { QueryFilters } from 'lib/types';
export async function getSessionMetrics( export async function getSessionMetrics(
...args: [websiteId: string, type: string, filters: QueryFilters, limit?: number, offset?: number] ...args: [
websiteId: string,
type: string,
filters: QueryFilters,
limit?: number,
offset?: number,
unit?: string,
]
) { ) {
return runQuery({ return runQuery({
[PRISMA]: () => relationalQuery(...args), [PRISMA]: () => relationalQuery(...args),
@ -19,6 +27,7 @@ async function relationalQuery(
filters: QueryFilters, filters: QueryFilters,
limit: number = 500, limit: number = 500,
offset: number = 0, offset: number = 0,
unit: string,
) { ) {
const column = FILTER_COLUMNS[type] || type; const column = FILTER_COLUMNS[type] || type;
const { parseFilters, rawQuery } = prisma; const { parseFilters, rawQuery } = prisma;
@ -62,6 +71,7 @@ async function clickhouseQuery(
filters: QueryFilters, filters: QueryFilters,
limit: number = 500, limit: number = 500,
offset: number = 0, offset: number = 0,
unit: string,
): Promise<{ x: string; y: number }[]> { ): Promise<{ x: string; y: number }[]> {
const column = FILTER_COLUMNS[type] || type; const column = FILTER_COLUMNS[type] || type;
const { parseFilters, rawQuery } = clickhouse; const { parseFilters, rawQuery } = clickhouse;
@ -70,6 +80,7 @@ async function clickhouseQuery(
eventType: EVENT_TYPE.pageView, eventType: EVENT_TYPE.pageView,
}); });
const includeCountry = column === 'city' || column === 'subdivision1'; const includeCountry = column === 'city' || column === 'subdivision1';
const table = unit === 'hour' ? 'website_event_stats_hourly' : 'website_event_stats_daily';
return rawQuery( return rawQuery(
` `
@ -77,7 +88,7 @@ async function clickhouseQuery(
${column} x, ${column} x,
uniq(session_id) y uniq(session_id) y
${includeCountry ? ', country' : ''} ${includeCountry ? ', country' : ''}
from website_event from ${table} website_event
where website_id = {websiteId:UUID} where website_id = {websiteId:UUID}
and created_at between {startDate:DateTime64} and {endDate:DateTime64} and created_at between {startDate:DateTime64} and {endDate:DateTime64}
and event_type = {eventType:UInt32} and event_type = {eventType:UInt32}

View File

@ -46,6 +46,7 @@ async function clickhouseQuery(
...filters, ...filters,
eventType: EVENT_TYPE.pageView, eventType: EVENT_TYPE.pageView,
}); });
const table = unit === 'hour' ? 'website_event_stats_hourly' : 'website_event_stats_daily';
return rawQuery( return rawQuery(
` `
@ -55,8 +56,8 @@ async function clickhouseQuery(
from ( from (
select select
${getDateSQL('created_at', unit, timezone)} as t, ${getDateSQL('created_at', unit, timezone)} as t,
count(distinct session_id) as y uniq(session_id) as y
from website_event from ${table} website_event
where website_id = {websiteId:UUID} where website_id = {websiteId:UUID}
and created_at between {startDate:DateTime64} and {endDate:DateTime64} and created_at between {startDate:DateTime64} and {endDate:DateTime64}
and event_type = {eventType:UInt32} and event_type = {eventType:UInt32}