반응형
- Array형태로 이루어진 칼럼에 like절
cardinality(filter(Col, x -> x like 'a')) > 0
- Json 형태로 이루어진 칼럼을 scalar 형태로 추출
json_extract_scalar(Col,'$.A')
- string 값 json parsing 하기
with a as (
select '{\\"popup_title\\":\\"MAX_LEVEL\\",\\"max_invitation\\":2,\\"current_invitation\\":1}' as col
-- select '{popup_title: "MAX_LEVEL", max_invitation: 2, current_invitation: 1}' as col
)
-- "{\\"popup_title\\":\\"MAX_LEVEL\\",\\"max_invitation\\":2,\\"current_invitation\\":1}"
-- "{popup_title: MAX_LEVEL, max_invitation: 2, current_invitation: 1}"
select *,
json_extract_scalar(replace(col,'\\',''),'$.current_invitation') as a
-- json_extract_scalar(col,'$.popup_title') as a
from a
- 복수의 like절
regexp_like(Col, 'A|B|C|D|E')
- Varchar -> timestamp
date_parse(Varchar,'%Y-%m-%d %H:%i:%s')
- timestamp -> varchar
date_format(timestamp,'%Y%m%d')
- timestamp에서 특정 unit 추출
extract(day from timestamp)
- 특정 week 추출
week_of_year(to_timestamp(yyyymmdd, 'yyyymmdd')) AS woy
- timestampe 계산
date_diff(unit,timestamp1,timestamp2)
- 최근 7주 조건
where yyyymmdd >= date_format(date_add('day', -42 - day_of_week(now()), now()), '%Y%m%d')
- 이중 조인(두가지 조건이 맞아 떨어져야 조인 가능)
table a join table b on a.1 = b.1 and a.2 = b.2
- 첫 주차 월요일 추출
to_char(date_trunc('week', date(current_timestamp AT TIME ZONE 'Asia/Seoul')), 'yyyymmdd')
- NULL값 변경
coalesce(col,'변환값')
- 문자열 대치
replace(cols,'바꿀 문자','바뀔 문자')
- Skewed 정도 확인
skewness(cols)
- 상관계수(수치형 col)
corr(col1,col2)
- 표준편차
stddev_pop(col)
- 중앙값
approx_percentile(col,0.5)
- Log화
ln(col)
log2(col)
log10(col)
- 특정 문자 추출
substr(col,시작(1),끝(6))
-> 1부터 6번째까지의 문자열이 추출됨 substr(heyjude,1,6) = 'heyjud'
- # 제외하는법
-
, COALESCE(LENGTH(regexp_replace(description, '#[^\\s]+', '')), 0) AS text_len_wo_hash
- HTML 태그를 제거하고 텍스트의 본문 길이와 줄 수를 측정
WITH cleaned_data AS (
SELECT
description,
regexp_replace(description, '<[^>]*>', '') AS cleaned_description
FROM
table_name
),
final_data AS (
SELECT
cleaned_description,
LENGTH(cleaned_description) AS full_text_length,
COALESCE(LENGTH(regexp_replace(cleaned_description, '#[^\\s]+', '')), 0) AS text_length_without_hashtags,
LENGTH(regexp_replace(cleaned_description, '[^\\n]', '')) + 1 AS line_count,
COALESCE(LENGTH(regexp_replace(regexp_replace(cleaned_description, '(^|\\n)#[^\\n]*', ''), '[^\\n]', '')) + 1, 0) AS line_count_without_hashtags
FROM
cleaned_data
)
SELECT
description,
full_text_length,
text_length_without_hashtags,
line_count,
line_count_without_hashtags
FROM
final_data;
- 줄 수 (line count) 세는법
, COALESCE(LENGTH(description) - LENGTH(regexp_replace(description, '\\n', '')) + 1, 0) AS line_count
from dump.card_collections CC
반응형
댓글