Unstructured text file read in Python to make it structured

Question:

File I have this attached text file which contains unstructured data with some information lines before. How can i structurize this data (extract information in structured manner). So in the end i have several columns (in this case 5) and have the corresponding information in that. Frame 50 contains 10 values, Frame 51 contains 10 vales and so on and also get the first 4 lines value in separate.
I tried and come up with the following code. But this is not the best list/array i get.
Thanks

frame =[]
frame1 =[]

flag = -1
counter = -1
counter_val = 0
f = open(filepath, "r")
for line in f:
    element = line.split(' ')
    if(len(element) == 4):
        if(element[1] == "Frame_Number") :
            # print(element[1])
            if(flag == 0):
                # print(len(frame1))
                frame.append(frame1)
            flag = 0
            counter = counter + 1
            counter_val = 0
            frame1 =[]
        continue
    if(flag == 0):   
        frame1.append(line)
        counter_val = counter_val + 1

print(frame[1])
Asked By: A R.

||

Answers:

Try the following

Code

import csv

def convert_csv(filenm):
  " Produces structured data by converting to CSV file "
  
  with open(filenm, 'r') as fin,  open('out.txt', 'w') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=' ',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    
    frames = []
    frame_vals = []
    for line in fin:
      line = line.rstrip()
      if line:
        if line[0] == "#":
          field, value = line[1:].split('=')
          field, value = field.strip(), value.strip()
          if field == 'Frame_Number':
            frames.append(value)    # current frame number
            frame_vals.append([])   # new sublist for frame values
        else:
          frame_vals[-1].append(line.strip())  # append to current frame values

    # Write header
    fnames = ['Frame_' + str(v) for v in frames]
    csv_writer.writerow(fnames)

    # write other data
    for row in zip(*frame_vals):  # transposing to get each frame in a column
      csv_writer.writerow(row)

convert_csv('testd.txt')

Test

Input: testd.txt

# Samples_per_Frame = 8192
# Chirp_Time_sec = 0.000133
# Pulse_Repetition_Time_sec = 0.00050355
# Frame_Period_sec = 0.2

# Frame_Number = 50
0.50061053
0.49938953
0.49426132
0.48962152
0.48791212
0.48937732
0.49523813
0.49914533
0.50158733
0.49914533
# Frame_Number = 51
0.50061053
0.49938953
0.49426132
0.48962152
0.48791212
0.48937732
0.49523813
0.49914533
0.50158733
0.49914533
# Frame_Number = 52
0.50793654
0.50647134
0.49841273
0.48937732
0.48644692
0.49035412
0.49768013
0.50647134
0.51282054
0.50940174
# Frame_Number = 53
0.49670333
0.49181932
0.4840049
0.48547012
0.48791212
0.49230772
0.49768013
0.49816853
0.49181932
0.48595852
# Frame_Number = 54
0.49352872
0.49597073
0.49987793
0.50354093
0.50402933
0.50036633
0.49841273
0.49743593
0.49865693
0.50012213

Output: out.txt

Frame_50 Frame_51 Frame_52 Frame_53 Frame_54
0.50061053 0.50061053 0.50793654 0.49670333 0.49352872
0.49938953 0.49938953 0.50647134 0.49181932 0.49597073
0.49426132 0.49426132 0.49841273 0.4840049 0.49987793
0.48962152 0.48962152 0.48937732 0.48547012 0.50354093
0.48791212 0.48791212 0.48644692 0.48791212 0.50402933
0.48937732 0.48937732 0.49035412 0.49230772 0.50036633
0.49523813 0.49523813 0.49768013 0.49768013 0.49841273
0.49914533 0.49914533 0.50647134 0.49816853 0.49743593
0.50158733 0.50158733 0.51282054 0.49181932 0.49865693
0.49914533 0.49914533 0.50940174 0.48595852 0.50012213

Regex Version

Changes

  • Uses Regex to identified meta data
  • Use dictionary to store field names and values
  • field names are lines that begin with ‘#’
  • field values are lines without ‘#’

Code

import re
import csv

def convert_csv(filenm):
  " Produces structured data by converting to CSV file "

  # https://stackoverflow.com/questions/3348460/csv-file-written-with-python-has-blank-lines-between-each-row
  with open(filenm, 'r') as fin,  open('out.txt', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=' ',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    
    # Regex attern for # followed by non-digits followed by = followed by number (integer or float)
    meta_data_pattern = re.compile(r'# (D+) = (d+(?:.d+)?)')
    
    all_data = {}       # Will place unstructured data into this dictionary
                        # Lines with # will be key for dictionary
                        # Lines that follow without a '#' will be values for the dictionary key
    
    for line in fin:
        if (line := line.rstrip()):                        # remove end of line terminator
            if (m := meta_data_pattern.match(line)):       # check for match using assign expression (needs Python 3.8+)
                all_data.setdefault(f'{line}', [])         # start new append group with meta data
                
                # Update key used for current data
                last_key = next(reversed(all_data.keys())) # last key in dictionary
            else:
                all_data[last_key].append(line)             # append to current field
                
    # Remove fields with no data
    all_data = {k:v for k, v in all_data.items() if v}
    
    # Insure all fields the same length
    max_len = len(max(all_data.values(), key = len))
    
    for k, v in all_data.items():
        all_data[k] += ['NaN'] * (max_len - len(v))           # Pad all to same length
        
    # Get field names
    fnames = [f"{m.group(1).split('_')[0]}_{m.group(2)}" for field in all_data.keys() if (m:=meta_data_pattern.match(field))]
     
    # Dividing frame data into chunks by the number of frames by column
    frame_data = list(zip(*all_data.values()))

    csv_writer.writerow(fnames)  # Write header

    # write other data
    for row in frame_data:
        print(row)
        csv_writer.writerow(row)

convert_csv('test.txt')
Answered By: DarrylG

Here’s a pandas solution,

import pandas as pd

# Read in the data as a Pandas Series
df = pd.read_csv('testsd.txt', sep = 'n', header = None, squeeze = True) 

# Get the names of the eventual column names ('# Frame_Number 50', ...)
colNames = df.loc[df.str.startswith('# Frame_Number')]

# Store the first few lines of metadata in another frame and drop them from the original dataframe
meta_df = df[: colNames.index.to_list()[0]]]
df.drop(range(colNames.index.to_list()[0]), inplace = True)

# Drop the eventual column names
df.drop(colNames.index.to_list(), inplace = True)

What is left in the original dataframe should be just the data. Now reshape the dataframe. Note that this only works if every column has the same number of entries.

df = pd.DataFrame(df.values.reshape(len(colNames), int(len(df) / len(colNames))).T, columns = colNames)

The reshape function takes as arguments the desired number of rows and columns. It reshapes horizontally, so we will transpose the result. Finally if you want, add the metadata that we saved as a column of the dataframe, although you should really save it as a file someplace else.

df['meta'] = meta_df

Write the dataframe to file:

df.to_csv('testsd.csv')

Output:

enter image description here

Categories: questions Tags: ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.