bufr.pm:bufrextract.pl_source

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Next revision
Previous revision
bufr.pm:bufrextract.pl_source [2022-05-31 09:29:31]
external edit
bufr.pm:bufrextract.pl_source [2026-03-26 17:56:54] (current)
pals
Line 2: Line 2:
 #!/usr/bin/perl #!/usr/bin/perl
  
-# (C) Copyright 2010-2019 MET Norway+Copyright (C) 2010-2026 MET Norway
 # #
 # This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
Line 30: Line 30:
 my %option = (); my %option = ();
 GetOptions( GetOptions(
-           \%option, +    \%option, 
-           'ahl=s',        # Extract BUFR messages with AHL matching <ahl_regexp> only +    'ahl=s',           # Extract BUFR messages with AHL matching <ahl_regexp> only 
-           'help',         # Print help information and exit +    'gts',             # Include full gts message envelope if present 
-           'only_ahl',     # Extract AHLs only +    'filter=s',        # Extract BUFR messages meeting the <metadata criteria> only 
-           'outfile=s',    # Print to file instead of STDOUT +    'help',            # Print help information and exit 
-           'reuse_ahl=i',  # Reuse last AHL if current BUFR message has no AHL +    'only_ahl',        # Extract AHLs only 
-           'verbose=i',    # Set verbose level to n, 0<=n<=6 (default 0) +    'outfile=s',       # Print to file instead of STDOUT 
-           'without_ahl',  # Print the BUFR messages only, skipping AHLs +    'verbose=i',       # Set verbose level to n, 0<=n<=6 (default 0) 
-       ) or pod2usage(-verbose => 0);+    'without_ahl',     # Print the BUFR messages only, skipping AHLs 
 +    ) or pod2usage(-verbose => 0);
  
 # User asked for help # User asked for help
Line 44: Line 45:
  
 # only_ahl and without_ahl are mutually exclusive # only_ahl and without_ahl are mutually exclusive
-pod2usage( -message => "Options only_ahl and without_ahl are mutually exclusive",+pod2usage( -message => "Options only_ahlwithout_ahl and gts are mutually exclusive",
            -exitval => 2,            -exitval => 2,
            -verbose => 0)            -verbose => 0)
-    if $option{only_ahl} && $option{without_ahl};+    if ( ($option{only_ahl} && ($option{without_ahl} || $option{gts})) 
 +         || ($option{without_ahl} && ($option{only_ahl} || $option{gts})) 
 +         || ($option{gts} && ($option{only_ahl} || $option{without_ahl})) );
  
 # Make sure there is at least one input file # Make sure there is at least one input file
Line 55: Line 58:
 Geo::BUFR->set_verbose($option{verbose}) if $option{verbose}; Geo::BUFR->set_verbose($option{verbose}) if $option{verbose};
  
-Set whether last ahl should be reused if current BUFR message has no AHL +For filtering on ahl
-Geo::BUFR->reuse_current_ahl($option{reuse_ahl}) if defined $option{reuse_ahl}; +
 my $ahl_regexp; my $ahl_regexp;
 if ($option{ahl}) { if ($option{ahl}) {
Line 63: Line 64:
     die "Argument to --ahl is not a valid Perl regular expression: $@" if $@;     die "Argument to --ahl is not a valid Perl regular expression: $@" if $@;
 } }
 +
 +# For filtering on metadata in section 0/1
 +my $filter = $option{filter} ? $option{filter} : '';
 +my $or_criteria_ref = get_filter_criteria($filter);
  
 # Where to direct output (including verbose output, but not output to STDERR) # Where to direct output (including verbose output, but not output to STDERR)
Line 80: Line 85:
 foreach my $inputfname ( @ARGV ) { foreach my $inputfname ( @ARGV ) {
     my $bufr = Geo::BUFR->new();     my $bufr = Geo::BUFR->new();
-    $bufr->set_filter_cb(\&filter_on_ahl,$ahl_regexp) if $option{ahl};+ 
 +    # Could alternatively have merged filtering on ahl and metadata into 
 +    # one single callback function, but that would be a rather complex 
 +    # one, so we prefer to do the filtering on metadata later 
 +    $bufr->set_filter_cb(\&filter_on_ahl, $ahl_regexp) if $option{ahl};
  
     # Open BUFR file     # Open BUFR file
Line 124: Line 133:
         }         }
  
-        next if $option{ahl} && $bufr->is_filtered();+ # Filtering on ahl 
 +        next READLOOP if $option{ahl} && $bufr->is_filtered(); 
 + 
 + # Filtering on metadata 
 +        next READLOOP if $or_criteria_ref && not or_filter($bufr, $or_criteria_ref); 
         # Skip messages where stated length of BUFR message is sure to         # Skip messages where stated length of BUFR message is sure to
         # be erroneous, unless we want ahls only (or should we skip         # be erroneous, unless we want ahls only (or should we skip
         # message in this case also? Hard choice...)         # message in this case also? Hard choice...)
-        next if !$option{only_ahl} && $bufr->bad_bufrlength();+        next READLOOP if !$option{only_ahl} && $bufr->bad_bufrlength();
  
         my $current_subset_number = $bufr->get_current_subset_number();         my $current_subset_number = $bufr->get_current_subset_number();
Line 137: Line 151:
         $current_message_number = $bufr->get_current_message_number();         $current_message_number = $bufr->get_current_message_number();
         $current_ahl = $bufr->get_current_ahl() || '';         $current_ahl = $bufr->get_current_ahl() || '';
 +        my $gts_eom = '';
  
-        if ($current_ahl && !$bufr->ahl_is_reused()) {+        if ($current_ahl) {
             if ($option{only_ahl}) {             if ($option{only_ahl}) {
                 print $OUT $current_ahl, "\n";                 print $OUT $current_ahl, "\n";
             } elsif (!$option{without_ahl}) {             } elsif (!$option{without_ahl}) {
 +                if ($option{gts}) {
 +                    my $current_gts_starting_line = $bufr->get_current_gts_starting_line() || '';
 +                    print $OUT $current_gts_starting_line;
 +                    $gts_eom = $bufr->get_current_gts_eom() || '';
 +                }
                 # Use \r\r\n after AHL, since this is the standard                 # Use \r\r\n after AHL, since this is the standard
                 # sequence used in GTS bulletins                 # sequence used in GTS bulletins
Line 150: Line 170:
  
         my $msg = $bufr->get_bufr_message();         my $msg = $bufr->get_bufr_message();
-        print $OUT $msg; +        print $OUT $msg, $gts_eom
-    }+  }
 } }
  
Line 162: Line 182:
     return $ahl =~ $ahl_regexp ? 0 : 1;     return $ahl =~ $ahl_regexp ? 0 : 1;
 } }
 +
 +# Get the list of alternative metadata criteria (these are separated
 +# by '|', see pod)
 +sub get_filter_criteria {
 +    my $filter = shift;
 +    return ('') if ! $filter;
 +
 +    my @or_criteria;
 +    my @criteria = split /[|]/, $filter;
 +    foreach my $cr (@criteria) {
 +        $cr =~ s/^\s+//;
 +        $cr =~ s/\s+$//;
 +        if ($cr ne '') {
 +            push @or_criteria, $cr;
 +        }
 +    }
 +    return \@or_criteria;
 +}
 +
 +# Return true (1) if the BUFR message is matching all @and_criteria
 +# (to be extracted) for at least one of the @or_criteria
 +sub or_filter {
 +    my ($bufr, $or_criteria_ref) = @_;
 +
 +    my $be = $bufr->get_bufr_edition() || return 0;
 +    my $dc = $bufr->get_data_category();
 +    # Choose to equate data_subcategory with int_data_subcategory, but
 +    # not quite sure about this
 +    my $ic = ($be == 4) ? $bufr->get_int_data_subcategory()
 +                        : $bufr->get_data_subcategory();
 +    my $lc = $bufr->get_loc_data_subcategory();
 +    my $oc = $bufr->get_centre();
 +    my $os = $bufr->get_subcentre();
 +    my $mt = $bufr->get_master_table_version();
 +    my $lt = $bufr->get_local_table_version();
 +    # This will not work for edition 3 when year is before 2000,
 +    # but hard to find a better way...
 +    my $ye = ($be == 4) ? $bufr->get_year()
 +                        : $bufr->get_year_of_century + 2000;
 +    my $mo = $bufr->get_month();
 +    my $da = $bufr->get_day();
 +    my $ho = $bufr->get_hour();
 +    my $mi = $bufr->get_minute();
 +    my $se = ($be == 4) ? $bufr->get_second() : 0;
 +
 +    my $include = 0;
 +  OR:
 +    foreach my $or_criterium (@$or_criteria_ref) {
 +        my $all_ok = 1;
 +        my @and_criteria = split /\s+/, $or_criterium;
 +      AND:
 +        foreach my $and_criterium (@and_criteria) {
 +            my ($c, $list) = split /=/, $and_criterium;
 +            my @list = split /,/, $list;
 +            if ($c eq 'be') {
 +                if (not grep { $_ eq $be } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'dc') {
 +                if (not grep { $_ eq $dc } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'ic') {
 +                if (not grep { $_ eq $ic } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'lc') {
 +                # Not in BUFR edition 3
 +                if (!(defined $lc) || not grep { $_ eq $lc } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'oc') {
 +                if (not grep { $_ eq $oc } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'os') {
 +                if (not grep { $_ eq $os } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'mt') {
 +                if (not grep { $_ eq $mt } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'lt') {
 +                if (not grep { $_ eq $lt } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'ye') {
 +                if (not grep { $_ eq $ye } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'mo') {
 +                if (not grep { $_ eq $mo } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'da') {
 +                if (not grep { $_ eq $da } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'ho') {
 +                if (not grep { $_ eq $ho } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'mi') {
 +                if (not grep { $_ eq $mi } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'se') {
 +                if (not grep { $_ eq $se } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            }  else {
 +                die "Metadata '$c' not recognized, check `perldoc bufrextract.pl`"
 +                    . " for the full list of 2-letter abbreviations accepted!";
 +            }
 +      } # end AND
 +        if ($all_ok == 1) {
 +            # BUFR message has met all conditions in this
 +            # or-criterium, so no need to check the others
 +            $include = 1;
 +            last OR;
 +      }
 +
 +  } # end OR
 +
 +    return $include;
 +}
 +
  
  
Line 172: Line 334:
   bufrextract.pl <bufr file(s)>   bufrextract.pl <bufr file(s)>
       [--ahl <ahl_regexp>]       [--ahl <ahl_regexp>]
-      [--only_ahl[--without_ahl]+      [--only_ahl | --without_ahl | --gts] 
 +      [--filter <metadata criteria>]
       [--outfile <filename>]       [--outfile <filename>]
-      [--reuse_ahl n] 
       [--help]       [--help]
       [--verbose n]       [--verbose n]
Line 181: Line 343:
  
 Extract all BUFR messages and/or corresponding AHLs from BUFR file(s), Extract all BUFR messages and/or corresponding AHLs from BUFR file(s),
-possibly filtering on AHL.+possibly filtering on AHL and/or metadata in section 1.
  
-The AHL (Abbreviated Header Line) is recognized as the TTAAii CCCC DTG +The AHL (Abbreviated Header Line) is recognized as the TTAAii CCCC 
-[BBB] immediately preceding the BUFR message.+YYGGgg [BBB] immediately preceding the BUFR message.
  
 Execute without arguments for Usage, with option C<--help> for some Execute without arguments for Usage, with option C<--help> for some
Line 195: Line 357:
    --ahl <ahl_regexp> Extract BUFR messages and/or AHLs with AHL    --ahl <ahl_regexp> Extract BUFR messages and/or AHLs with AHL
                       matching <ahl_regexp> only                       matching <ahl_regexp> only
 +   --gts              Include full gts message envelope if present
    --only_ahl         Extract AHLs only    --only_ahl         Extract AHLs only
    --without_ahl      Extract BUFR messages only    --without_ahl      Extract BUFR messages only
 +   --filter <metadata criteria>
 +                      Extract BUFR messages matching the <metadata criteria> only
    --outfile <filename>    --outfile <filename>
                       Will print to <filename> instead of STDOUT                       Will print to <filename> instead of STDOUT
-   --reuse_ahl n  n=0 (default) AHL is considered belonging to a BUFR message 
-                      only if immediately preceding 
-                  n=1 When filtering using --ahl: Reuse last AHL found if current 
-                      BUFR message has no immediately preceding AHL 
    --help             Display Usage and explain the options used. For even    --help             Display Usage and explain the options used. For even
                       more info you might prefer to consult perldoc bufrextract.pl                       more info you might prefer to consult perldoc bufrextract.pl
Line 210: Line 371:
  
 For option C<--ahl> the <ahl_regexp> should be a Perl regular For option C<--ahl> the <ahl_regexp> should be a Perl regular
-expression. E.g. C<--ahl 'ISS... ENMI'> will decode only BUFR SHIP+expression. E.g. C<--ahl 'ISS... ENMI'> will decode only SHIP BUFR
 (ISS) from CCCC=ENMI. (ISS) from CCCC=ENMI.
  
-If the BUFR file(s) are known to consist solely of GTS bulletins, you +Use option C<--gtsif you want the full GTS message envelope (if 
-might consider setting C<--reuse 1when applying C<--ahl>, in order +present) to be included in outputThere are 2 main variations on this 
-to extract all (and not only the first) BUFR messages in multi message +envelope (SOH/ETX and ZCZC notation), for details see the Manual on 
-bulletinsSuch bulletins are very rare nowadays, however, and see +the GTS: Attachment II-4. Format of Meteorological Messages.
-also the L</"CAVEAT"> for more on this option. Note that the +
-corresponding AHL is still extracted (and printedonly once.+
  
-No bufrtables are needed for running bufrextract.pl, since section 4 +Using C<--filter> makes it possible to filter based on almost any of 
-in BUFR message will not be decoded (which also speeds up execution +the metadata present in section 1 (and 0) of the BUFR messages. Some few 
-quite a bit).+examples which hopefully are enough to illustrate how to write the 
 +<metadata criteria>: according to Common Code Table C-13 of 
 +WMO-no. 306, "dc=0 ic=0,1,2,6" should take care of synoptic and 
 +one-hour observations from fixed-land stations, while "dc=1 ic=0,6" 
 +should do the same for marine stations. If you want to extract both, 
 +use for <metadata criteria>: "dc=0 ic=0,1,2,6 | dc=1 ic=0,6"
 + 
 +Here is the full list of metadata available for filtering (the first 
 +2-letter abbreviation is what should be used in the <metadata criteria>): 
 + 
 +  be = BUFR edition 
 +  oc = Originating centre 
 +  os = Originating subcentre 
 +  dc = Data category (table A) 
 +  ic = International data subcategory 
 +  lc = Local data subcategory 
 +  mt = Master table version number 
 +  lt = Local table version number 
 +  ye = Year 
 +  mo = Month 
 +  da = Day 
 +  ho = Hour 
 +  mi = Minute 
 +  se = Second 
 + 
 +Note that no bufrtables are needed for running bufrextract.pl, since 
 +section 4 in BUFR message will not be decoded (which also speeds up 
 +execution quite a bit).
  
 =head1 HINTS =head1 HINTS
Line 230: Line 416:
 section 0-3, by making your own copy of bufrextract.pl and then section 0-3, by making your own copy of bufrextract.pl and then
 employing one of the many C<get_> subroutines in BUFR.pm. For example, employing one of the many C<get_> subroutines in BUFR.pm. For example,
-to extract only BUFR messages with data category 1, add the following+to extract only BUFR messages with TM315009, add the following
 line just before calling C<is_filtered()> in code: line just before calling C<is_filtered()> in code:
  
-  next if $bufr->get_data_category() != 1; +  next if $bufr->get_descriptors_unexpanded() ne '315009';
- +
-Or to extract BUFR messages with TM315009 only: +
- +
-  next if bufr->get_descriptors_unexpanded() ne '315009';+
  
 =head1 CAVEAT =head1 CAVEAT
Line 244: Line 426:
 between the GTS AHL and the start of BUFR message (besides the between the GTS AHL and the start of BUFR message (besides the
 standard character sequence CRCRLF), likely leading bufrextract.pl to standard character sequence CRCRLF), likely leading bufrextract.pl to
-miss the AHL. Also, if applying C<--reuse 1>, the BUFR message of such +miss the AHL.
-a GTS bulletin will then be wrongly associated with the AHL of the +
-previous GTS bulletin when filtering on AHL. If bulletins with this +
-kind of error is more of a concern than multi message bulletins, you +
-should probably refrain from making use of the C<--reuse 1> option.+
  
 =head1 AUTHOR =head1 AUTHOR
Line 256: Line 434:
 =head1 COPYRIGHT =head1 COPYRIGHT
  
-Copyright (C) 2010-2019 MET Norway+Copyright (C) 2010-2026 MET Norway
  
 =cut =cut
- 
 </code> </code>
  • bufr.pm/bufrextract.pl_source.1653989371.txt.gz
  • Last modified: 2022-05-31 09:29:31
  • by