Skip to content

Commit ef3367d

Browse files
author
Fortina Elena
committed
Updated some comments
1 parent de4c01d commit ef3367d

File tree

1 file changed

+13
-13
lines changed

1 file changed

+13
-13
lines changed

table_analyzer.py

+13-13
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ def run(argv = None):
117117
pipeline_options.view_as(SetupOptions).save_main_session = True # see https://beam.apache.org/releases/pydoc/2.7.0/_modules/apache_beam/io/gcp/pubsub_it_pipeline.html
118118
p = beam.Pipeline(options = pipeline_options)
119119

120-
# Retrieve list of files to process.
120+
# Retrieve list of Avro files to process.
121121
# If another analysis has already been run, only the new files are analyzed and the results are then merged
122122
# with those from the last analysis in an incremental way.
123123
files_to_process = ['gs://{}'.format(fpath) for fpath in fs.walk(known_args.input_bucket)]
@@ -141,22 +141,22 @@ def run(argv = None):
141141
(p
142142
| 'get_files_list' >> beam.Create(files_to_process) # create PCollection containing the names of all files to process
143143
| 'read_files' >> beam.io.avroio.ReadAllFromAvro() # returns all rows in all files as dictionaries
144-
# {<column name> --> <column value at the row>}
145-
| 'parse_and_classify_rows' >> beam.ParDo(ParseRowDoFn()).with_outputs() # applies method ParseRowDoFn to each row
144+
# {column name : column value at the row}
145+
| 'parse_and_classify_rows' >> beam.ParDo(ParseRowDoFn()).with_outputs() # applies method process in ParseRowDoFn to each row
146146
)
147147

148148
valid_inputs = processed_input[None] # main output
149149
invalid_times = processed_input[ParseRowDoFn.OUTPUT_TAG_INVALID] # secondary output: list of invalid insertion times
150150

151-
# PIPELINE BRANCH 1: count distinct values with min/max insertion time; filter 10 most frequent values for each column
151+
# PIPELINE BRANCH 1: count distinct values with min/max insertion time; filter 10 most frequent values for each column.
152152
# This performs the equivalent of:
153153
# select
154154
# col, vtype, value
155155
# count(*),
156-
# min(record_insertion_time), # first time when value has appeared in this column
156+
# min(record_insertion_time), # first time when value has appeared in column col
157157
# max(record_insertion_time)
158158
# from
159-
# {source data}
159+
# valid_inputs
160160
# group by
161161
# col, vtype, value
162162
# Inspired by: https://github.com/apache/beam/blob/master/sdks/python/apache_beam/transforms/combiners.py
@@ -180,9 +180,9 @@ def merge_accumulators(self, accumulators):
180180
def extract_output(self, accumulator):
181181
return accumulator
182182

183-
# Input: ((col, vtype, value), record_insertion_time)
183+
# Input: elements in valid_inputs, which have the form ((col, vtype, value), record_insertion_time)
184184
# CountMinMaxCombineFn is called for each (col, vtype, value) and operates on element = record_insertion_time.
185-
# Output: ((col, vtype, value), (count, min_record_insertion_time, max_record_insertion_time))
185+
# Output: ((col, vtype, value), (counts, min_record_insertion_time, max_record_insertion_time))
186186
# (the output is grouped by key = (col, vtype, value))
187187
distinct_values = (valid_inputs
188188
| 'value_counts' >> beam.CombinePerKey(CountMinMaxCombineFn())
@@ -239,8 +239,8 @@ def merge_valuecounts(x):
239239
def remap_for_filter(x): # input: ((col, vtype, value), (counts, mintime, maxtime))
240240
ctv, values = x
241241
return (ctv[0], ctv[1]), (values[0], (ctv[2], values[1], values[2])) # <--- beam.combiners.Top.LargestPerKey requires arguments
242-
# to be provided in this form (in order to filter largest
243-
# by counts):
242+
# to be provided in this form
243+
# (in order to filter largest by counts):
244244
# ((col, vtype), (counts, (value, mindate, maxdate)))
245245

246246
top_values = (distinct_values
@@ -260,7 +260,7 @@ def format_top_values(x):
260260
result += header + value +'\t'+unicode(counts)+'\t'+unicode(mindate)+'\t'+unicode(maxdate)+'\r\n'
261261
result = result[:-2] # remove last \r\n
262262
return result
263-
# result looks like this (for one single input)
263+
# result looks like this (for one single input x): top 10 values
264264
# col vtype value counts1 mindate1 maxdate1
265265
# col vtype value counts2 mindate2 maxdate2
266266
# ...
@@ -271,15 +271,15 @@ def format_top_values(x):
271271
| 'save_top_values' >> beam.io.WriteToText(known_args.output_bucket+'topvalues')
272272
)
273273

274-
# PIPELINE BRANCH 2: count total and null values by period (= year/month in this case)
274+
# PIPELINE BRANCH 2: count total and null values by period (= year/month in this case).
275275
# Performs the equivalent of:
276276
# select
277277
# col,
278278
# year-month,
279279
# count(*),
280280
# sum(if(value is null or value = '', 1, 0))
281281
# from
282-
# {input data}
282+
#      valid_inputs
283283
# group by
284284
# col,
285285
# year-month

0 commit comments

Comments
 (0)