Skip to content

Commit 69fe48b

Browse files
authored
Release 1.3.2 (#12)
* Release 1.3.2 -writing to single-file CSV now includes only 1 (the proper) header -Added SUM_DECIMAL_NUMBER AVG_DECIMAL_NUMBER STD_DECIMAL_NUMBER metrics * Update versions and assembly guidelines, now Spark supported version is 2.4.0
1 parent cb5c7be commit 69fe48b

File tree

22 files changed

+498
-142
lines changed

22 files changed

+498
-142
lines changed

.gitignore

Lines changed: 87 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,90 +1,129 @@
1-
# Created by https://www.gitignore.io/api/intellij,scala,sbt
2-
3-
### Intellij ###
1+
# IntelliJ IDEA
2+
.idea
3+
4+
# Spark tmp dir
5+
tmp
6+
7+
#Docker Data
8+
docker/data
9+
10+
### macOS template
11+
# General
12+
*.DS_Store
13+
.AppleDouble
14+
.LSOverride
15+
16+
# Icon must end with two \r
17+
Icon
18+
19+
# Thumbnails
20+
._*
21+
22+
# Files that might appear in the root of a volume
23+
.DocumentRevisions-V100
24+
.fseventsd
25+
.Spotlight-V100
26+
.TemporaryItems
27+
.Trashes
28+
.VolumeIcon.icns
29+
.com.apple.timemachine.donotpresent
30+
31+
# Directories potentially created on remote AFP share
32+
.AppleDB
33+
.AppleDesktop
34+
Network Trash Folder
35+
Temporary Items
36+
.apdisk
37+
### JetBrains template
438
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
539
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
640

741
# User-specific stuff:
8-
.idea/workspace.xml
9-
.idea/tasks.xml
42+
.idea/**/workspace.xml
43+
.idea/**/tasks.xml
44+
.idea/dictionaries
1045

1146
# Sensitive or high-churn files:
12-
.idea/dataSources/
13-
.idea/dataSources.ids
14-
.idea/dataSources.xml
15-
.idea/dataSources.local.xml
16-
.idea/sqlDataSources.xml
17-
.idea/dynamic.xml
18-
.idea/uiDesigner.xml
47+
.idea/**/dataSources/
48+
.idea/**/dataSources.ids
49+
.idea/**/dataSources.xml
50+
.idea/**/dataSources.local.xml
51+
.idea/**/sqlDataSources.xml
52+
.idea/**/dynamic.xml
53+
.idea/**/uiDesigner.xml
1954

2055
# Gradle:
21-
.idea/gradle.xml
22-
.idea/libraries
56+
.idea/**/gradle.xml
57+
.idea/**/libraries
58+
59+
# CMake
60+
cmake-build-debug/
2361

2462
# Mongo Explorer plugin:
25-
.idea/mongoSettings.xml
63+
.idea/**/mongoSettings.xml
2664

2765
## File-based project format:
2866
*.iws
2967

3068
## Plugin-specific files:
3169

3270
# IntelliJ
33-
/out/
71+
out/
3472

3573
# mpeltonen/sbt-idea plugin
3674
.idea_modules/
3775

3876
# JIRA plugin
3977
atlassian-ide-plugin.xml
4078

79+
# Cursive Clojure plugin
80+
.idea/replstate.xml
81+
4182
# Crashlytics plugin (for Android Studio and IntelliJ)
4283
com_crashlytics_export_strings.xml
4384
crashlytics.properties
4485
crashlytics-build.properties
4586
fabric.properties
87+
### Java template
88+
# Compiled class file
89+
*.class
4690

47-
### Intellij Patch ###
48-
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
91+
# Log file
92+
*.log
4993

50-
# *.iml
51-
# modules.xml
52-
# .idea/misc.xml
53-
# *.ipr
94+
# BlueJ files
95+
*.ctxt
5496

97+
# Mobile Tools for Java (J2ME)
98+
.mtj.tmp/
5599

56-
### Scala ###
57-
*.class
58-
*.log
100+
# Package Files #
101+
*.jar
102+
*.war
103+
*.ear
104+
*.zip
105+
*.tar.gz
106+
*.rar
107+
108+
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
109+
hs_err_pid*
110+
### Scala template
111+
### SBT template
112+
# Simple Build Tool
113+
# http://www.scala-sbt.org/release/docs/Getting-Started/Directories.html#configuring-version-control
59114

60-
# sbt specific
61-
.cache
62-
.history
63-
.lib/
64115
dist/*
65116
target/
66117
lib_managed/
67118
src_managed/
68119
project/boot/
69120
project/plugins/project/
70-
project/target/*
71-
project/project/target/
72-
73-
# Scala-IDE specific
74-
.scala_dependencies
75-
.worksheet
76-
77-
# ENSIME specific
78-
.ensime_cache/
79-
.ensime
80-
81-
82-
### SBT ###
83-
# Simple Build Tool
84-
# http://www.scala-sbt.org/release/docs/Getting-Started/Directories.html#configuring-version-control
85-
121+
.history
122+
.cache
123+
.lib/
124+
sbt-cache
125+
repositories
86126

87-
# End of https://www.gitignore.io/api/intellij,scala,sbt
88-
.idea/
89127

90-
tmp/*
128+
# lab configuration file
129+
.lab.cache

.gitlab-ci.yml

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,20 +14,11 @@ stages:
1414
- test
1515
- assembly
1616

17-
test-1.6:
18-
stage: test
19-
script:
20-
- sbt -Dfile.encoding=UTF-8 "project core" 'set sparkVersion:="1.6.0"' 'testOnly * -- -n it.agilelab.bigdata.DataQuality.Spark1xTest'
21-
2217
test-2.4:
2318
stage: test
2419
script:
25-
- sbt -Dfile.encoding=UTF-8 "project core" 'set sparkVersion:="2.4.0"' 'testOnly * -- -n it.agilelab.bigdata.DataQuality.Spark2xTest'
26-
27-
assembly-1.6:
28-
stage: assembly
29-
script:
30-
- sbt "project core" 'set sparkVersion:="1.6.0"' assembly
20+
- ./scripts/test-only-spark2.sh
21+
coverage: '/Statement coverage.: (.*\%)/'
3122

3223
assembly-2.4:
3324
stage: assembly

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ You can improve it by sending pull requests to this repository.
6464
## Installation
6565

6666
Data Quality is currently using following dependencies:
67-
- Scala 2.10(core)/2.11(ui)
68-
- Apache Spark 1.6
67+
- Scala 2.11.12
68+
- Apache Spark 2.4.0
6969
- PostgreSQL 9.3 (works also with Oracle and SQLite)
7070

7171
To be able to use all the features of Data Quality you'll need to setup a database
@@ -78,12 +78,12 @@ All modules of DQ works independently.:
7878

7979
## Building CORE module
8080

81-
Data Quality core module can be built with 2 different versions of Spark (1.6.0, 2.2.0). By default if will select 1.6.0. In order to build with Spark 2 set Multiversion.sparkVersion as in following snippet:
81+
From the sbt console opened in the root project issue the following commands:
8282
```
83-
- set Multiversion.sparkVersion := "2.2.0"
8483
- project core
8584
- assembly
8685
```
86+
This should generate the core artifact in `<your-project-dir>/dq-core/target/scala-<scala-version>/dq-core_<spark-major.minor>_<scala-major.minor.patch>-<dq-version>.jar`
8787

8888
## Examples
8989

build.sbt

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import com.typesafe.sbt.SbtNativePackager.autoImport.NativePackagerHelper._
33
import sbt.Keys.scalaVersion
44

55
ThisBuild / organization := "it.agilelab"
6-
ThisBuild / version := "1.3.0-SNAPSHOT"
6+
ThisBuild / version := "1.3.2"
77

88
scalacOptions ++= Seq(
99
"-target:jvm-1.8",
@@ -80,7 +80,6 @@ lazy val core = (project in file("dq-core"))
8080
}
8181
((resourceDirectory in Compile).value / confFile) -> "conf/application.conf"
8282
},
83-
8483
Universal / mappings ++= {
8584
val integrationFolder = integrationEnv.value match {
8685
case _ => "integration/dev"
@@ -134,4 +133,4 @@ lazy val be = (project in file("dq-be"))
134133
libraryDependencies ++= {
135134
Seq(jdbc, cache, ws, specs2 % Test, evolutions, guice) ++ Dependencies.dq_be
136135
}
137-
).dependsOn(api,common)
136+
).dependsOn(api,common)
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
checkId,checkName,description,checkedFile,baseMetric,comparedMetric,comparedThreshold,status,message,execDate
2+
depth_avg_check,GREATER_THAN,Checks is average of depth is greather than 10,USGS_2000,depth_avg,,50.0,Success,Check depth_avg_check for metric AVG_NUMBER on column USGS_2000[Buffer(Depth)] check if (MetricResult) 85.05389330922242 is GREATER_THAN 50.0 (compareMetric/threshold). Result: Success. CheckStatus: 85.05389330922242 > 50.0.,2019-01-01
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
checkId,checkName,description,checkedFile,baseMetric,comparedMetric,comparedThreshold,status,message,execDate
2+
depth_avg_check,GREATER_THAN,Checks is average of depth is greather than 10,USGS_2000,depth_avg,"",50.0,Success,Check depth_avg_check for metric AVG_NUMBER on column USGS_2000[Buffer(Depth)] check if (MetricResult) 85.05389330922242 is GREATER_THAN 50.0 (compareMetric/threshold). Result: Success. CheckStatus: 85.05389330922242 > 50.0.,2019-01-01

docs/installation/ui-setup.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
#### DataQuality - UI
22
The UI to create configuration for Data Quality framework
33

4-
## Quick Start
4+
5+
## Quick Start
6+
57
### Prerequisites
68

79
DataQuality UI has some requirements:

docs/metrics.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,16 +137,38 @@ Returns sum of all numerical values in the column
137137

138138
Parameters: none
139139

140+
##### "SUM_DECIMAL_NUMBER"
141+
Returns sum of all numerical values in the column
142+
143+
In financial use cases, this metric is recommended over SUM_NUMBER.
144+
145+
Parameters: none
146+
140147
##### "AVG_NUMBER"
141148
Returns average of all numerical value inside of the column
142149

143150
Parameters: none
144151

152+
##### "AVG_DECIMAL_NUMBER"
153+
Returns the average of all numerical values inside the column
154+
155+
In financial use cases, this metric is recommended over AVG_NUMBER.
156+
157+
Parameters: none
158+
145159
##### "STD_NUMBER"
146160
Return standard deviation of all numerical values inside of the column
147161

148162
Parameters: none
149163

164+
##### "STD_DECIMAL_NUMBER"
165+
166+
Returns the standard deviation of all numerical values inside the column
167+
168+
In financial use cases, this metric is recommended over STD_NUMBER.
169+
170+
Parameters: none
171+
150172
##### "MIN_STRING"
151173
Return string minimum of the column (mostly for comparing dates)
152174

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
data_quality {
2+
3+
application_name: "local"
4+
run_configuration_version: 1.0
5+
hive_warehouse_path: ""
6+
hbase_host: ""
7+
8+
tmp_files_management: {
9+
local_fs_path: "/tmp/fs"
10+
hdfs_path: "/tmp/hdfs"
11+
}
12+
13+
metric_error_management: {
14+
dump_directory_path: "/tmp/dump"
15+
dump_size: 1000 // max number of collected errors for 1 metric for 1 partition
16+
empty_file: true
17+
file_config: {
18+
format: "csv"
19+
delimiter: ","
20+
quote: "\""
21+
escape: "\\"
22+
quote_mode: "ALL"
23+
}
24+
}
25+
26+
virtual_sources_management: {
27+
dump_directory_path: "/tmp/virtual"
28+
file_format: "csv"
29+
delimiter: ","
30+
}
31+
32+
// Result storage configuration
33+
// Supported types: "DB", "NONE"
34+
// Use "" to turn off storage feature
35+
// "DB" subtypes: "SQLITE", "POSTGRES", "ORACLE"
36+
storage:{
37+
type: "NONE"
38+
config: {
39+
subtype: "POSTGRES"
40+
host: "localhost:5433/dataquality"
41+
user: "postgres"
42+
password: "postgres"
43+
schema: "dev"
44+
}
45+
}
46+
47+
// Check failure alert mailer configuration
48+
mailing {
49+
// "external" - to use external SMTP server
50+
// "internal" - to use internal SMTP through a bash script (check universal/bin/sendMail.sh for extra configuration)
51+
// "" - to turn off mailing
52+
mode: "internal"
53+
mail_script_path: ""
54+
// config: {
55+
// address: "[email protected]"
56+
// hostname: "smtp.gmail.com"
57+
// username: "test.testovic"
58+
// password: "password123"
59+
// smtpPort: 465
60+
// sslOnConnect: true
61+
// }
62+
63+
notifications: false
64+
}
65+
}

0 commit comments

Comments
 (0)